diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index b8c6788e0bc03..e1d7b7fa85221 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1088,8 +1088,8 @@ enum NodeType { /// These treat -0 as ordered less than +0, matching the behavior of IEEE-754 /// 2019's minimumNumber/maximumNumber. /// - /// Deprecated, and will be removed soon, as FMINNUM/FMAXNUM have the same - /// semantics now. + /// Some ISAs, such as AArch64, MIPSr6 and LoongArch, have instructions with + /// these semantics, usually documented as IEEE 754-2008 maxNum/minNum. FMINNUM_IEEE, FMAXNUM_IEEE, diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 99968baec98e4..d0a8e6910bbf3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8801,50 +8801,43 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node, SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, SelectionDAG &DAG) const { - if (SDValue Expanded = expandVectorNaryOpBySplitting(Node, DAG)) - return Expanded; - EVT VT = Node->getValueType(0); - if (VT.isScalableVector()) - report_fatal_error( - "Expanding fminnum/fmaxnum for scalable vectors is undefined."); - SDLoc dl(Node); - unsigned NewOp = - Node->getOpcode() == ISD::FMINNUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; - - if (isOperationLegalOrCustom(NewOp, VT)) { - SDValue Quiet0 = Node->getOperand(0); - SDValue Quiet1 = Node->getOperand(1); + SDValue Op0 = Node->getOperand(0); + SDValue Op1 = Node->getOperand(1); + SDNodeFlags Flags = Node->getFlags(); + unsigned Opc = Node->getOpcode(); - if (!Node->getFlags().hasNoNaNs()) { - // Insert canonicalizes if it's possible we need to quiet to get correct - // sNaN behavior. 
- if (!DAG.isKnownNeverSNaN(Quiet0)) { - Quiet0 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0, - Node->getFlags()); - } - if (!DAG.isKnownNeverSNaN(Quiet1)) { - Quiet1 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1, - Node->getFlags()); - } - } + unsigned NewOp = Opc == ISD::FMINNUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; + if (isOperationLegalOrCustom(NewOp, VT)) + return DAG.getNode(NewOp, dl, VT, Op0, Op1, Flags); - return DAG.getNode(NewOp, dl, VT, Quiet0, Quiet1, Node->getFlags()); + // If the target has FMINIMUM/FMAXIMUM but not FMINNUM/FMAXNUM use that + // instead if there are no NaNs. + if (Flags.hasNoNaNs() || + (DAG.isKnownNeverNaN(Op0) && DAG.isKnownNeverNaN(Op1))) { + unsigned IEEE2019Op = Opc == ISD::FMINNUM ? ISD::FMINIMUM : ISD::FMAXIMUM; + if (isOperationLegalOrCustom(IEEE2019Op, VT)) + return DAG.getNode(IEEE2019Op, dl, VT, Op0, Op1, Flags); } - // If the target has FMINIMUM/FMAXIMUM but not FMINNUM/FMAXNUM use that - // instead if there are no NaNs. + // If the target has FMINIMUMNUM/FMAXIMUMNUM but not FMINNUM/FMAXNUM use that + // instead if there are no signaling NaNs. - if (Node->getFlags().hasNoNaNs() || - (DAG.isKnownNeverNaN(Node->getOperand(0)) && - DAG.isKnownNeverNaN(Node->getOperand(1)))) { - unsigned IEEE2018Op = - Node->getOpcode() == ISD::FMINNUM ? ISD::FMINIMUM : ISD::FMAXIMUM; - if (isOperationLegalOrCustom(IEEE2018Op, VT)) - return DAG.getNode(IEEE2018Op, dl, VT, Node->getOperand(0), - Node->getOperand(1), Node->getFlags()); + if (Flags.hasNoNaNs() || + (DAG.isKnownNeverSNaN(Op0) && DAG.isKnownNeverSNaN(Op1))) { + unsigned IEEE2019NumOp = + Opc == ISD::FMINNUM ? 
ISD::FMINIMUMNUM : ISD::FMAXIMUMNUM; + if (isOperationLegalOrCustom(IEEE2019NumOp, VT)) + return DAG.getNode(IEEE2019NumOp, dl, VT, Op0, Op1, Flags); } + if (SDValue Expanded = expandVectorNaryOpBySplitting(Node, DAG)) + return Expanded; + + if (VT.isScalableVector()) + report_fatal_error( + "Expanding fminnum/fmaxnum for scalable vectors is undefined."); + if (SDValue SelCC = createSelectForFMINNUM_FMAXNUM(Node, DAG)) return SelCC; diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 94efba4014ec5..ddb88c7b98564 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3514,7 +3514,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: s_get_pc_i64 s[0:1] ; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store@gotpcrel+4 ; GFX1250-NEXT: v_writelane_b32 v4, s30, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv ; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 ; GFX1250-NEXT: v_writelane_b32 v4, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3758,7 +3758,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: s_get_pc_i64 s[0:1] ; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 ; GFX1250-NEXT: v_writelane_b32 v4, s30, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv ; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 ; GFX1250-NEXT: v_writelane_b32 v4, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -4022,7 +4022,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: s_get_pc_i64 s[0:1] ; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 ; GFX1250-NEXT: v_writelane_b32 v5, s30, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv ; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 ; GFX1250-NEXT: v_mov_b32_e32 v4, v2 ; 
GFX1250-NEXT: v_writelane_b32 v5, s31, 1 @@ -4300,7 +4300,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: s_get_pc_i64 s[0:1] ; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 ; GFX1250-NEXT: v_writelane_b32 v5, s30, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv ; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 ; GFX1250-NEXT: v_mov_b32_e32 v4, v2 ; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 @@ -4616,7 +4616,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: s_get_pc_i64 s[0:1] ; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 ; GFX1250-NEXT: v_writelane_b32 v5, s30, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv ; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 ; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5019,7 +5019,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: s_get_pc_i64 s[0:1] ; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 ; GFX1250-NEXT: v_writelane_b32 v9, s30, 0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 nv ; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 ; GFX1250-NEXT: v_writelane_b32 v9, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -19458,8 +19458,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -19469,8 +19467,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -19599,10 +19595,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v2, v3, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 @@ -19614,13 +19606,9 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_min_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 @@ -19781,12 +19769,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v1, v1, v3 ; GCN-NEXT: v_min_f32_e32 v3, v5, v4 ; GCN-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 @@ -19800,18 +19782,12 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -20025,14 +20001,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v4, v5, v4 ; GCN-NEXT: v_min_f32_e32 v1, v1, v3 ; GCN-NEXT: v_min_f32_e32 v3, v7, v6 @@ -20050,21 +20018,13 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_min_f32_e32 v4, v5, v4 ; 
GFX7-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 @@ -20331,22 +20291,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v8, v9, v8 ; GCN-NEXT: v_min_f32_e32 v3, v3, v7 ; GCN-NEXT: v_min_f32_e32 v7, v11, v10 @@ -20372,41 +20316,25 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_min_f32_e32 v8, v9, v8 ; GFX7-NEXT: 
v_min_f32_e32 v3, v3, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v9, v7 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_min_f32_e32 v6, v9, v6 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_min_f32_e32 v5, v9, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_min_f32_e32 v5, v9, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 @@ -20871,74 +20799,42 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_min_f32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_min_f32_e32 v7, v7, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; 
GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_min_f32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_min_f32_e32 v6, v6, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_min_f32_e32 v15, v15, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_min_f32_e32 v5, v5, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_min_f32_e32 v14, v14, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_min_f32_e32 v4, v4, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_min_f32_e32 v13, v13, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_min_f32_e32 v3, v3, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_min_f32_e32 v12, v12, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_min_f32_e32 v2, v2, v10 ; 
GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v11, v11, v17 ; GCN-NEXT: v_min_f32_e32 v1, v1, v9 ; GCN-NEXT: v_min_f32_e32 v9, v10, v19 @@ -20968,81 +20864,49 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_min_f32_e32 v16, v17, v16 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_min_f32_e32 v15, v17, v15 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_min_f32_e32 v14, v17, v14 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_min_f32_e32 v13, v17, v13 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_min_f32_e32 v12, v17, v12 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v11, v17, v11 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_min_f32_e32 v10, v17, v10 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_min_f32_e32 v9, v17, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_min_f32_e32 v9, v17, v9 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: 
v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 @@ -21904,162 +21768,100 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v31, v32, v31 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_min_f32_e32 v14, v14, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v30, v32, v30 ; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_min_f32_e32 v13, v13, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v29, v32, v29 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_min_f32_e32 v12, v12, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v28, v32, v28 ; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: 
v_min_f32_e32 v11, v11, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v27, v32, v27 ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_min_f32_e32 v10, v10, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v26, v32, v26 ; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_min_f32_e32 v9, v9, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v25, v32, v25 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_min_f32_e32 v8, v8, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v24, v32, v24 ; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_min_f32_e32 v7, v7, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v23, v32, v23 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, 
v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_min_f32_e32 v6, v6, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v22, v32, v22 ; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_min_f32_e32 v5, v5, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v21, v32, v21 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_min_f32_e32 v4, v4, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v20, v32, v20 ; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_min_f32_e32 v3, v3, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v19, v32, v19 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_min_f32_e32 v2, v2, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v18, v32, 
v18 ; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v17, v32, v17 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, v0, v16 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GCN-NEXT: v_alignbit_b32 v0, v17, v0, 16 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 ; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 @@ -22090,8 +21892,6 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_min_f32_e32 v17, v17, v18 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v17 @@ -22106,163 +21906,99 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_min_f32_e32 v31, v32, v31 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX7-NEXT: 
v_and_b32_e32 v30, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_min_f32_e32 v30, v32, v30 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_min_f32_e32 v29, v32, v29 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_min_f32_e32 v28, v32, v28 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_min_f32_e32 v28, v32, v28 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_min_f32_e32 v27, v27, v33 -; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v27, 
0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_min_f32_e32 v32, v33, v32 +; GFX7-NEXT: v_min_f32_e32 v27, v32, v27 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX7-NEXT: v_min_f32_e32 v26, v32, v26 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_min_f32_e32 v25, v33, v25 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX7-NEXT: v_min_f32_e32 v25, v32, v25 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_min_f32_e32 v32, v32, v33 +; GFX7-NEXT: v_min_f32_e32 v15, v15, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_min_f32_e32 v24, v33, v24 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_min_f32_e32 v23, v33, v23 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_min_f32_e32 v22, v33, v22 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_min_f32_e32 v21, v33, v21 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_min_f32_e32 v20, v33, v20 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 ; GFX7-NEXT: 
v_and_b32_e32 v19, 0xffff0000, v18 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v19, v33, v19 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_min_f32_e32 v18, v33, v18 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_min_f32_e32 v17, v33, v17 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_min_f32_e32 v17, v33, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v17 ; GFX7-NEXT: v_alignbit_b32 v0, v16, v0, 16 @@ -22284,11 +22020,11 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: v_alignbit_b32 v8, v16, v8, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v26 ; GFX7-NEXT: v_alignbit_b32 v9, v16, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27 ; GFX7-NEXT: v_alignbit_b32 v10, v16, v10, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v28 ; GFX7-NEXT: v_alignbit_b32 v11, v16, v11, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v32 ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v31 ; 
GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v30 ; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v29 @@ -23902,8 +23638,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -23913,8 +23647,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -24043,10 +23775,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v2, v3, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 @@ -24058,13 +23786,9 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; 
GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 @@ -24225,12 +23949,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v1, v1, v3 ; GCN-NEXT: v_max_f32_e32 v3, v5, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -24244,18 +23962,12 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -24469,14 +24181,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, 
v7 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v4, v5, v4 ; GCN-NEXT: v_max_f32_e32 v1, v1, v3 ; GCN-NEXT: v_max_f32_e32 v3, v7, v6 @@ -24494,21 +24198,13 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 @@ -24775,22 +24471,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: 
v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v8, v9, v8 ; GCN-NEXT: v_max_f32_e32 v3, v3, v7 ; GCN-NEXT: v_max_f32_e32 v7, v11, v10 @@ -24816,41 +24496,25 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_max_f32_e32 v8, v9, v8 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v9, v7 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_max_f32_e32 v6, v9, v6 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_max_f32_e32 v5, v9, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_max_f32_e32 v5, v9, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 
16, v6 @@ -25315,74 +24979,42 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_max_f32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_max_f32_e32 v7, v7, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_max_f32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_max_f32_e32 v6, v6, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_max_f32_e32 v15, v15, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_max_f32_e32 v5, v5, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_max_f32_e32 v14, v14, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_max_f32_e32 v4, v4, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; 
GCN-NEXT: v_max_f32_e32 v13, v13, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_max_f32_e32 v3, v3, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_max_f32_e32 v12, v12, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_max_f32_e32 v2, v2, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v11, v11, v17 ; GCN-NEXT: v_max_f32_e32 v1, v1, v9 ; GCN-NEXT: v_max_f32_e32 v9, v10, v19 @@ -25412,81 +25044,49 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_max_f32_e32 v16, v17, v16 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 
v6, 1.0, v6 ; GFX7-NEXT: v_max_f32_e32 v15, v17, v15 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_max_f32_e32 v14, v17, v14 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_max_f32_e32 v13, v17, v13 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_max_f32_e32 v12, v17, v12 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v11, v17, v11 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: 
v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_max_f32_e32 v10, v17, v10 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_max_f32_e32 v9, v17, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_max_f32_e32 v9, v17, v9 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 @@ -26348,162 +25948,100 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v31, v32, v31 ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_max_f32_e32 v14, v14, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v30, v32, v30 ; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_max_f32_e32 v13, v13, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: 
v_max_f32_e32 v29, v32, v29 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_max_f32_e32 v12, v12, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v28, v32, v28 ; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_max_f32_e32 v11, v11, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v27, v32, v27 ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_max_f32_e32 v10, v10, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v26, v32, v26 ; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_max_f32_e32 v9, v9, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v25, v32, v25 ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_max_f32_e32 v8, v8, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: v_mul_f32_e32 v24, 
1.0, v24 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v24, v32, v24 ; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_max_f32_e32 v7, v7, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v23, v32, v23 ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_max_f32_e32 v6, v6, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v22, v32, v22 ; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_max_f32_e32 v5, v5, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v21, v32, v21 ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_max_f32_e32 v4, v4, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v20, v32, v20 ; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_max_f32_e32 v3, 
v3, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v19, v32, v19 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_max_f32_e32 v2, v2, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v18, v32, v18 ; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v17, v32, v17 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v0, v0, v16 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GCN-NEXT: v_alignbit_b32 v0, v17, v0, 16 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 ; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 @@ -26534,8 +26072,6 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, 
v16 ; GCN-NEXT: v_max_f32_e32 v17, v17, v18 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v17 @@ -26550,163 +26086,99 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_max_f32_e32 v31, v32, v31 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_max_f32_e32 v30, v32, v30 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_max_f32_e32 v29, v32, v29 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_max_f32_e32 v28, v32, v28 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_max_f32_e32 v28, v32, v28 ; GFX7-NEXT: 
v_max_f32_e32 v11, v11, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_max_f32_e32 v27, v27, v33 -; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_max_f32_e32 v32, v33, v32 +; GFX7-NEXT: v_max_f32_e32 v27, v32, v27 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_max_f32_e32 v26, v33, v26 +; GFX7-NEXT: v_max_f32_e32 v26, v32, v26 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; 
GFX7-NEXT: v_max_f32_e32 v25, v33, v25 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX7-NEXT: v_max_f32_e32 v25, v32, v25 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_max_f32_e32 v32, v32, v33 +; GFX7-NEXT: v_max_f32_e32 v15, v15, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_max_f32_e32 v24, v33, v24 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_max_f32_e32 v23, v33, v23 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_max_f32_e32 v22, v33, v22 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: 
v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_max_f32_e32 v21, v33, v21 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_max_f32_e32 v20, v33, v20 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v19, v33, v19 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_max_f32_e32 v18, v33, v18 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_max_f32_e32 v17, v33, v17 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_max_f32_e32 v17, v33, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v17 ; GFX7-NEXT: 
v_alignbit_b32 v0, v16, v0, 16 @@ -26728,11 +26200,11 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: v_alignbit_b32 v8, v16, v8, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v26 ; GFX7-NEXT: v_alignbit_b32 v9, v16, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27 ; GFX7-NEXT: v_alignbit_b32 v10, v16, v10, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v28 ; GFX7-NEXT: v_alignbit_b32 v11, v16, v11, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v32 ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v31 ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v30 ; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v29 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 1b957444869e7..12de1a83cf2f3 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -33,18 +33,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ 
-84,18 +82,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -111,18 +107,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap 
v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -139,18 +133,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -206,24 +198,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_max_f32_e32 v2, v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: 
buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -256,22 +245,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -282,23 +268,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; 
%bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -309,23 +293,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_max_f32_e32 v1, v2, 
v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -410,13 +392,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX942-NEXT: v_max_f32_e32 v8, v6, v5 +; GFX942-NEXT: v_max_f32_e32 v8, v9, v5 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 @@ -529,13 +509,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX90A-NEXT: v_max_f32_e32 v8, v6, v5 +; GFX90A-NEXT: v_max_f32_e32 v8, v9, v5 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 @@ -580,22 +558,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, 
s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v7, v7 -; GFX908-NEXT: v_max_f32_e32 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_max_f32_e32 v7, v8, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -607,21 +583,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz 
.LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -638,22 +614,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v5 ; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX8-NEXT: v_max_f32_e32 v6, v5, v8 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_max_f32_e32 v7, v8, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -665,21 +639,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; 
GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB2_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -757,18 +731,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -786,10 +758,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: 
v_dual_mov_b32 v3, s16 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -797,9 +768,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -817,19 +786,17 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -847,18 
+814,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -874,18 +839,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, 
s[16:19], 0 offen offset:1024 glc @@ -902,18 +865,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -930,18 +891,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -958,20 +917,18 @@ define 
float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -1008,18 +965,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1059,18 
+1014,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1086,18 +1039,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 
v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -1114,18 +1065,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -1174,27 +1123,24 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v8, s16 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v10, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[4:5] ; GFX12-NEXT: 
s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1217,28 +1163,25 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v8, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v10, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1270,27 +1213,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v8, s20 +; GFX908-NEXT: v_mov_b32_e32 v10, s20 ; 
GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: v_mov_b32_e32 v3, v5 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1300,27 +1243,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: v_mov_b32_e32 v10, s20 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, v5 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1359,24 +1302,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: v_mov_b32_e32 v6, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1400,25 +1341,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v6, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1] +; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: 
v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1451,26 +1390,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v10, v5 +; GFX908-NEXT: v_mov_b32_e32 v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v7 ; 
GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1481,26 +1418,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1560,17 +1495,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: 
s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[5:6], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[13:14], v[5:6] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[5:6] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1660,17 +1593,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_max_f64 v[11:12], v[13:14], v[5:6] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1786,15 +1717,13 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 
exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: v_max_f64 v[11:12], v[13:14], v[5:6] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1850,15 +1779,13 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: v_max_f64 v[11:12], v[13:14], v[5:6] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -1958,27 +1885,24 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v8, s16 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v10, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; 
GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2001,28 +1925,25 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v8, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v10, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; 
GFX11-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2034,28 +1955,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s20 -; GFX10-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v8, s20 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, s20 ; GFX10-NEXT: 
s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX10-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, v5 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2066,24 +1987,24 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX90A-NEXT: v_mov_b32_e32 v8, s20 +; GFX90A-NEXT: v_mov_b32_e32 v10, s20 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX90A-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2093,27 +2014,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: v_mov_b32_e32 v8, s20 +; GFX908-NEXT: v_mov_b32_e32 v10, s20 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: v_mov_b32_e32 v3, v5 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2123,27 +2044,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: 
v_mov_b32_e32 v10, s20 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, v5 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2153,27 +2074,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v8, s20 +; GFX7-NEXT: v_mov_b32_e32 v10, s20 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX7-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2183,28 +2104,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 -; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: 
v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX6-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2224,27 +2146,24 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v8, s16 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v10, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[4:5] ; GFX12-NEXT: 
s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2267,28 +2186,25 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v8, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v10, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2320,27 +2236,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v8, 
s20 +; GFX908-NEXT: v_mov_b32_e32 v10, s20 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: v_mov_b32_e32 v3, v5 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2350,27 +2266,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: 
v_mov_b32_e32 v10, s20 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, v5 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_max_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2413,7 +2329,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -2433,13 +2348,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: 
v_max_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v0.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2463,46 +2376,44 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2510,49 +2421,46 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX942-NEXT: 
v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 @@ -2564,12 +2472,10 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2590,83 +2496,78 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2674,34 +2575,32 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; 
GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2709,35 +2608,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, 
s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v5, s4 +; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2745,36 +2642,34 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, 
s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2868,7 +2763,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -2888,13 +2782,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v0.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2917,39 +2809,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: 
s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -2963,31 +2853,29 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: 
v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2998,13 +2886,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 @@ -3016,12 +2903,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 
v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3041,38 +2926,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3085,32 +2967,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: 
v_max_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3123,29 +3003,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; 
GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3157,30 +3035,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap 
v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3192,31 +3068,29 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s7, 
v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3311,15 +3185,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3333,29 +3207,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: 
s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v6.l, v6.l, v5.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3370,14 +3241,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) 
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3385,7 +3256,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3395,15 +3266,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3417,30 +3288,27 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v10 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; 
GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3455,14 +3323,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3470,19 +3338,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; 
GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX942-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX942-NEXT: v_not_b32_e32 v10, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v11, v6 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -3494,24 +3362,22 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX942-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v11 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v5 +; 
GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 @@ -3526,36 +3392,36 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v8 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB12_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3567,29 +3433,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l +; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v6.l, v6.l, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3603,14 +3466,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3619,22 +3482,22 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3646,30 +3509,27 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; 
=>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3683,14 +3543,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: 
s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3699,20 +3559,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX10-NEXT: v_not_b32_e32 v9, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v11, v7 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3722,26 +3582,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v7, v10, 
s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3753,15 +3611,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3770,19 +3628,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 -; GFX90A-NEXT: v_not_b32_e32 v10, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v11, v6 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3794,24 +3652,22 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 
Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3823,33 +3679,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; 
GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX908-NEXT: v_not_b32_e32 v9, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v11, v6 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3861,25 +3717,23 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: 
.LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3891,33 +3745,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX8-NEXT: v_not_b32_e32 v9, v4 +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v11, v6 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: 
v_readfirstlane_b32 s8, v0 @@ -3929,26 +3783,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3960,21 +3812,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; 
GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4576,7 +4428,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 @@ -4617,7 +4468,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5089,7 +4939,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 @@ -5129,7 +4978,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; 
GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5923,7 +5771,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 @@ -5994,7 +5841,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_max_f32_e32 v4, v4, v11 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 @@ -6049,11 +5895,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v3, s16 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6061,9 +5905,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v2 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6080,21 +5922,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6110,11 +5949,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v3, s16 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, 
v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6122,9 +5959,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -6142,19 +5977,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX10-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -6172,18 +6005,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6199,18 +6030,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX908-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -6227,28 +6056,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -6355,23 +6180,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 
0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v3, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v1, v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6384,25 +6208,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 +; 
GFX942-NEXT: v_pk_max_f16 v2, v3, v0 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6413,24 +6233,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v1, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6443,24 +6262,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v1, v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -6472,22 +6289,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen 
offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6498,23 +6312,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_max_f16 v1, v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 
0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6525,27 +6337,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6658,24 +6466,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v7, v7 +; GFX12-NEXT: v_pk_max_num_f16 v7, v8, v5 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v6, v5, v8 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6690,14 +6496,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6705,7 +6511,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6728,17 +6534,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX942-NEXT: v_pk_max_f16 v8, v9, v5 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_pk_max_f16 v8, v6, v5 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6784,24 +6588,22 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v7, v7 +; GFX11-NEXT: v_pk_max_f16 v7, v8, v5 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v6, v5, v8 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6815,14 +6617,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v4, s[4:7], 0 offen offset:1024 glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6831,7 +6633,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6848,23 +6650,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v7, v7 +; GFX10-NEXT: v_pk_max_f16 v7, v8, v5 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v6, v5, v8 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -6876,15 +6676,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; 
GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -6893,7 +6693,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6916,13 +6716,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 -; GFX90A-NEXT: v_pk_max_f16 v8, v6, v5 +; GFX90A-NEXT: v_pk_max_f16 v8, v9, v5 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 @@ -6967,22 +6765,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; 
GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v7, v7 -; GFX908-NEXT: v_pk_max_f16 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_pk_max_f16 v7, v8, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6994,21 +6790,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -7025,26 +6821,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v7, v7 -; GFX8-NEXT: v_max_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_max_f16_sdwa v6, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7056,21 +6848,21 @@ 
define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9164,18 +8956,16 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX942-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; 
GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9215,18 +9005,16 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9245,18 +9033,16 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v5, v2 ; 
GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -9273,18 +9059,16 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index da140ac4bf59c..1905e38d7af09 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -33,18 +33,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: 
v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -84,18 +82,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -111,18 +107,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: 
v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -139,18 +133,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -206,24 +198,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_min_f32_e32 v2, v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -256,22 +245,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 
v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -282,23 +268,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_min_f32_e32 v1, v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -309,23 +293,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; 
GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_min_f32_e32 v1, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -410,13 +392,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX942-NEXT: v_min_f32_e32 v8, v6, v5 +; GFX942-NEXT: v_min_f32_e32 v8, v9, v5 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 @@ -529,13 +509,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 -; GFX90A-NEXT: v_min_f32_e32 v8, v6, v5 +; GFX90A-NEXT: v_min_f32_e32 v8, v9, v5 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 @@ -580,22 +558,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v5, v7, v7 -; GFX908-NEXT: v_min_f32_e32 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_min_f32_e32 v7, v8, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -607,21 +583,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in 
Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -638,22 +614,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v5 ; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX8-NEXT: v_min_f32_e32 v6, v5, v8 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_min_f32_e32 v7, v8, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -665,21 +639,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; 
GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB2_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -757,18 +731,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: 
buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -786,10 +758,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -797,9 +768,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -817,19 +786,17 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -847,18 +814,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -874,18 +839,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, 
s20 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -902,18 +865,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -930,18 +891,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -958,20 +917,18 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -1008,18 +965,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; 
GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1059,18 +1014,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1086,18 +1039,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: 
v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -1114,18 +1065,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -1174,27 +1123,24 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v8, s16 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v10, s16 ; GFX12-NEXT: s_mov_b32 
s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1217,28 +1163,25 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v8, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 
:: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v10, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1270,27 +1213,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: 
buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v8, s20 +; GFX908-NEXT: v_mov_b32_e32 v10, s20 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: v_mov_b32_e32 v3, v5 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1300,27 +1243,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, 
s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: v_mov_b32_e32 v10, s20 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, v5 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1359,24 +1302,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: v_mov_b32_e32 v6, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null 
offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1400,25 +1341,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v6, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1] +; GFX11-NEXT: v_dual_mov_b32 v10, 
v5 :: v_dual_mov_b32 v9, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1451,26 +1390,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v10, v5 +; GFX908-NEXT: v_mov_b32_e32 v9, v4 +; GFX908-NEXT: 
v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1481,26 +1418,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v7 ; GFX8-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1560,17 +1495,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[5:6], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[13:14], v[5:6] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[5:6] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1660,17 +1593,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_min_f64 v[11:12], v[13:14], v[5:6] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6] +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1786,15 +1717,13 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: v_min_f64 v[11:12], v[13:14], v[5:6] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1850,15 +1779,13 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: v_min_f64 v[11:12], v[13:14], v[5:6] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -1958,27 +1885,24 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] -; 
GFX12-NEXT: v_mov_b32_e32 v8, s16 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v10, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2001,28 +1925,25 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v8, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v10, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2034,28 +1955,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s20 -; GFX10-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v8, s20 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX10-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, v5 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2066,24 +1987,24 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX90A-NEXT: v_mov_b32_e32 v8, s20 +; GFX90A-NEXT: v_mov_b32_e32 v10, s20 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX90A-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2093,27 +2014,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; 
GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v8, s20 +; GFX908-NEXT: v_mov_b32_e32 v10, s20 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: v_mov_b32_e32 v3, v5 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2123,27 +2044,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: v_mov_b32_e32 v10, s20 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, v5 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2153,27 +2074,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dwordx2 
v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v8, s20 +; GFX7-NEXT: v_mov_b32_e32 v10, s20 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX7-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX7-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2183,28 +2104,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 
s20 +; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 -; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX6-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2224,27 +2146,24 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] -; GFX12-NEXT: v_mov_b32_e32 v8, s16 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 
v10, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2267,28 +2186,25 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] -; GFX11-NEXT: 
v_mov_b32_e32 v8, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: v_mov_b32_e32 v10, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX11-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2320,27 +2236,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v8, s20 +; GFX908-NEXT: v_mov_b32_e32 v10, s20 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: v_mov_b32_e32 v3, v5 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2350,27 +2266,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: v_mov_b32_e32 v10, s20 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, v5 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_min_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2413,7 +2329,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: 
s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -2433,13 +2348,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v1.l, v0.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2463,46 +2376,44 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; 
GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 
depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2510,49 +2421,46 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; 
GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 @@ -2564,12 +2472,10 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v1.l, v0.l ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2590,83 +2496,78 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; 
GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, 
s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2674,34 +2575,32 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2709,35 +2608,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v5, s4 +; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; 
GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2745,36 +2642,34 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2868,7 +2763,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -2888,13 +2782,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v1.l, v0.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2917,39 +2809,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 
v3, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_or_b32 
v1, v2, s6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -2963,31 +2853,29 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; 
GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2998,13 +2886,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 @@ -3016,12 +2903,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_min_f16_e32 v1.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3041,38 +2926,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: 
v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3085,32 +2967,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v2, v3, 
s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3123,29 +3003,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 
s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3157,30 +3035,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3192,31 +3068,29 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_min_f16_e32 v0, 
v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3311,15 +3185,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; 
GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3333,29 +3207,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v6.l, v6.l, v5.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; 
GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3370,14 +3241,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3385,7 +3256,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3395,15 +3266,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3417,30 +3288,27 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, v4, v10 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3455,14 +3323,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; 
GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3470,19 +3338,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX942-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX942-NEXT: v_not_b32_e32 v10, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v11, v6 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -3494,24 +3362,22 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX942-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 
; GFX942-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, v4, v11 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX942-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 @@ -3526,36 +3392,36 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v8 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB12_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3567,29 +3433,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 
0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v6.l, v6.l, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3603,14 +3466,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: 
buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3619,22 +3482,22 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3646,30 +3509,27 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; 
GFX11-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3683,14 +3543,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3699,20 +3559,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX10-NEXT: v_not_b32_e32 v9, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v11, v7 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 
s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3722,26 +3582,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3753,15 +3611,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: 
buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3770,19 +3628,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 -; GFX90A-NEXT: v_not_b32_e32 v10, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v11, v6 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3794,24 +3652,22 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; 
GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX90A-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3823,33 +3679,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 ; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX908-NEXT: v_not_b32_e32 v9, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v11, v6 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3861,25 +3717,23 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX908-NEXT: 
v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3891,33 +3745,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX8-NEXT: v_and_b32_e32 
v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX8-NEXT: v_not_b32_e32 v9, v4 +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v11, v6 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3929,26 +3783,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3960,21 +3812,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; 
GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4576,7 +4428,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 @@ -4617,7 +4468,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5089,7 +4939,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 
v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 @@ -5129,7 +4978,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5923,7 +5771,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 @@ -5994,7 +5841,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_min_f32_e32 v4, v4, v11 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 @@ -6049,11 +5895,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v3, s16 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; 
=>This Inner Loop Header: Depth=1 @@ -6061,9 +5905,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v4, v5, v2 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6080,21 +5922,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6110,11 +5949,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v3, s16 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6122,9 +5959,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -6142,19 +5977,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX10-NEXT: v_pk_min_f16 
v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -6172,18 +6005,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6199,18 +6030,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX908-NEXT: 
v_pk_min_f16 v4, v0, v2 +; GFX908-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -6227,28 +6056,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_min_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -6355,23 +6180,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v3, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_min_num_f16 v1, v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6384,25 +6208,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: 
buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX942-NEXT: v_pk_min_f16 v2, v3, v0 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6413,24 +6233,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX11-NEXT: v_pk_min_f16 v1, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_pk_min_f16 v0, v0, v2 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6443,24 +6262,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_pk_min_f16 v1, v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ 
-6472,22 +6289,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc +; GFX90A-NEXT: v_pk_min_f16 v2, v3, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6498,23 +6312,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_min_f16 v1, v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6525,27 +6337,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc +; GFX8-NEXT: 
v_min_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6658,24 +6466,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v5, v7, v7 +; GFX12-NEXT: v_pk_min_num_f16 v7, v8, v5 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v6, v5, v8 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; 
GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6690,14 +6496,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6705,7 +6511,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6728,17 +6534,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX942-NEXT: v_pk_min_f16 v8, v9, v5 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_pk_min_f16 v8, v6, v5 +; GFX942-NEXT: 
v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6784,24 +6588,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v5, v7, v7 +; GFX11-NEXT: v_pk_min_f16 v7, v8, v5 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v6, v5, v8 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6815,14 +6617,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v4, s[4:7], 0 offen offset:1024 glc ; 
GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6831,7 +6633,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6848,23 +6650,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v5, v7, v7 +; GFX10-NEXT: v_pk_min_f16 v7, v8, v5 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v6, v5, v8 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: 
.LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -6876,15 +6676,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -6893,7 +6693,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6916,13 +6716,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 -; GFX90A-NEXT: v_pk_min_f16 v8, v6, v5 +; GFX90A-NEXT: v_pk_min_f16 v8, 
v9, v5 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 @@ -6967,22 +6765,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v5, v7, v7 -; GFX908-NEXT: v_pk_min_f16 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_pk_min_f16 v7, v8, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6994,21 +6790,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -7025,26 +6821,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v5, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v7, v7 -; GFX8-NEXT: v_min_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_min_f16_sdwa v6, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 
v7, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7056,21 +6848,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9164,18 +8956,16 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX942-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX942-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9215,18 +9005,16 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9245,18 +9033,16 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; 
GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc @@ -9273,18 +9059,16 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index ffd52c2704409..4b67f2c3e7be5 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -359,7 +359,6 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: 
v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -374,12 +373,11 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_max_f32_e32 v2, 0x80000000, v3 ; GFX8-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -391,7 +389,6 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -401,12 +398,11 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -415,12 +411,11 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 
v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -449,7 +444,6 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 ; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 @@ -466,12 +460,11 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX8-NEXT: v_max_f32_e32 v2, 0, v3 ; GFX8-NEXT: v_min_f32_e32 v3, 1.0, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -485,7 +478,6 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0, v1 ; GFX9-NEXT: v_min_f32_e32 v2, 1.0, v1 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] @@ -502,9 +494,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc @@ -520,9 +510,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1 ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -2226,8 +2214,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -2244,8 +2232,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0 +; GFX8-NEXT: v_max_f32_e32 v2, 0, v3 +; GFX8-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2256,8 +2244,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 +; GFX9-NEXT: v_max_f32_e32 v1, 0, v1 +; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2265,13 +2253,12 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 +; GFX11-NEXT: v_maxmin_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -3110,22 +3097,21 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v2, 0, v2 -; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, s0, v3 ; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3 
-; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -3221,10 +3207,9 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v2, 2.0, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, 2.0, v3 ; GFX8-NEXT: v_min_f16_e32 v2, 1.0, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -3236,7 +3221,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 2.0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -3251,9 +3235,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -3267,9 +3249,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -3328,10 +3308,9 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v2, 0, v2 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, 0, v3 ; GFX8-NEXT: v_min_f16_e32 v2, 0, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -3343,7 +3322,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -3358,9 +3336,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: v_pk_max_f16 v1, v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -3374,9 +3350,7 @@ define amdgpu_kernel void 
@v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -3927,22 +3901,21 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v2, 0x7e00, v2 +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v3, 0, v3 -; GFX8-NEXT: v_min_f16_e32 v3, 0x7e00, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, s0, v3 +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -4029,22 +4002,21 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v2, 0, v2 -; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3 +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, s0, v3 ; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll index 42245e3d7013d..5da081681c6af 100644 --- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll +++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll @@ -2076,8 +2076,6 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C) ; GCN-LABEL: test109: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GCN-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 ; GCN-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3 ; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 ; GCN-NEXT: v_cmp_gt_f32_e64 
s0, v1, v4 @@ -2124,10 +2122,9 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar ; GCN-LABEL: test111: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GCN-NEXT: v_dual_min_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4 +; GCN-NEXT: v_min_f32_e32 v2, v2, v3 ; GCN-NEXT: v_min3_f32 v0, v0, v1, v2 -; GCN-NEXT: v_min_f32_e32 v0, v0, v3 +; GCN-NEXT: v_min_f32_e32 v0, v0, v4 ; GCN-NEXT: v_min3_f32 v0, v5, v6, v0 ; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -2154,12 +2151,10 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar ; GCN-LABEL: test112: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; GCN-NEXT: v_min_f32_e32 v2, v2, v3 ; GCN-NEXT: v_cmp_nge_f32_e32 vcc_lo, v4, v8 -; GCN-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_min_f32 v2, v2, v3 -; GCN-NEXT: v_max_f32_e32 v3, v6, v6 ; GCN-NEXT: v_min3_f32 v0, v0, v1, v2 -; GCN-NEXT: v_min3_f32 v0, v0, v5, v3 +; GCN-NEXT: v_min3_f32 v0, v0, v5, v6 ; GCN-NEXT: v_cmp_lt_f32_e64 s0, v0, v8 ; GCN-NEXT: s_or_b32 s0, s0, vcc_lo ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -2186,9 +2181,8 @@ define i1 @test113(float %arg1, float %arg2, float %arg3, float %C) { ; GCN-LABEL: test113: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 ; GCN-NEXT: v_cmp_nge_f32_e64 s0, v0, v3 ; GCN-NEXT: s_or_b32 s0, s0, vcc_lo ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -2205,9 +2199,8 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) { ; GCN-LABEL: test114: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GCN-NEXT: v_cmp_nge_f32_e32 vcc_lo, v2, v3 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_cmp_nge_f32_e32 vcc_lo, v2, v3 ; GCN-NEXT: v_cmp_gt_f32_e64 s0, v0, v3 ; GCN-NEXT: s_and_b32 s0, s0, vcc_lo ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -2224,8 +2217,7 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C) ; GCN-LABEL: test115: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GCN-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3 +; GCN-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 ; GCN-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3 ; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 ; GCN-NEXT: v_cmp_nge_f32_e64 s0, v1, v4 @@ -2248,11 +2240,6 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar ; GCN-LABEL: test116: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8 -; GCN-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GCN-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GCN-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v4, v4, v4 -; GCN-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v6, v6, v6 ; GCN-NEXT: v_min_f32_e32 v8, v8, v9 ; GCN-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_min_f32 v3, v4, v5 ; GCN-NEXT: v_max_f32_e32 v4, v6, v7 @@ -2292,10 +2279,6 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar ; GCN-LABEL: test117: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v6, v6, v6 -; GCN-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v10, v10, v10 -; GCN-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GCN-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 
v2, v2, v2 ; GCN-NEXT: v_min_f32_e32 v6, v6, v7 ; GCN-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v10, v11 ; GCN-NEXT: v_min_f32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll index f6886d7499d73..9815eecb953df 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs-known-signbit-combine-fast-fdiv-lowering.ll @@ -68,9 +68,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_maxnum_fabs(float %x, float % ; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_maxnum_fabs: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_max_f32_e64 v2, |v2|, |v2| -; CHECK-NEXT: v_max_f32_e64 v1, |v1|, |v1| -; CHECK-NEXT: v_max_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_max_f32_e64 v1, |v1|, |v2| ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 @@ -92,9 +90,7 @@ define float @fdiv_fast_daz_rhs_signbit_known_zero_minnum_fabs(float %x, float % ; CHECK-LABEL: fdiv_fast_daz_rhs_signbit_known_zero_minnum_fabs: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_max_f32_e64 v2, |v2|, |v2| -; CHECK-NEXT: v_max_f32_e64 v1, |v1|, |v1| -; CHECK-NEXT: v_min_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_min_f32_e64 v1, |v1|, |v2| ; CHECK-NEXT: s_mov_b32 s4, 0x6f800000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x2f800000 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index ddc889f8075d5..c6ffbf3581ae1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -31,27 +31,25 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-LABEL: 
flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -77,25 +75,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: 
v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -103,13 +99,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -127,13 +121,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 
1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -174,27 +166,25 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -222,25 +212,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -248,13 +236,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -274,17 +260,15 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 @@ -327,24 +311,22 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_max_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -386,12 +368,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -414,17 +394,15 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: 
s_cbranch_execnz .LBB2_1 @@ -439,17 +417,15 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 @@ -488,22 +464,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; 
GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -535,20 +509,18 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -558,20 +530,18 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -581,20 +551,18 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -629,22 +597,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: 
flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -678,20 +644,18 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 
v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -701,20 +665,18 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -726,20 +688,18 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -781,24 +741,22 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, 
v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -842,12 +800,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -870,12 +826,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -893,20 +847,18 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB5_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -945,27 +897,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, 
s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -993,28 +943,26 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1022,13 +970,11 @@ define float 
@flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1048,17 +994,15 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 @@ -1098,22 +1042,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] 
offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1147,23 +1089,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1173,20 +1113,18 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1198,20 +1136,18 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: 
flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1249,42 +1185,38 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; 
GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1304,14 +1236,12 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1329,25 +1259,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1355,13 +1283,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1379,13 +1305,11 @@ define float 
@flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1403,13 +1327,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -1442,27 +1364,25 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; 
GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1488,25 +1408,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1514,13 +1432,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1538,13 +1454,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1589,27 +1503,25 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1635,25 +1547,23 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1661,13 +1571,11 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1685,13 +1593,11 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: 
v_max_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1732,27 +1638,25 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1780,25 +1684,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1806,13 +1708,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: buffer_wbinvl1 @@ -1832,17 +1732,15 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 @@ -1885,24 +1783,22 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_max_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv 
sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1944,12 +1840,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1972,17 +1866,15 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz 
.LBB12_1 @@ -1997,17 +1889,15 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -2046,22 +1936,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; 
GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2093,20 +1981,18 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2116,20 +2002,18 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2139,20 +2023,18 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2187,22 +2069,20 @@ define void 
@flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2236,20 +2116,18 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2259,20 +2137,18 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2284,20 +2160,18 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2339,24 +2213,22 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2400,12 +2272,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2428,12 +2298,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2451,20 +2319,18 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: 
flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2503,27 +2369,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2551,28 +2415,26 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] 
; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2580,13 +2442,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2606,17 +2466,15 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -2656,22 +2514,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-LABEL: 
flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2705,23 +2561,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 
; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2731,20 +2585,18 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2756,20 +2608,18 @@ define void 
@flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2802,29 +2652,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -2833,7 +2681,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -2842,16 +2690,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 
v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2883,14 +2729,12 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_cbranch_execz .LBB18_2 ; GFX942-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -2898,29 +2742,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; 
%atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2928,22 +2770,20 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -2976,16 +2816,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: s_cbranch_execz .LBB18_2 ; GFX10-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3017,15 +2855,13 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: 
buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3033,90 +2869,89 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB18_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_6 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: 
s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB18_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB18_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB18_2 +; GFX908-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 
offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 
v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3124,17 +2959,16 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3167,16 +3001,14 @@ define double 
@flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_cbranch_execz .LBB18_2 ; GFX7-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3192,7 +3024,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo @@ -3218,9 +3049,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], 
v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3240,13 +3070,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3280,14 +3108,12 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB19_2 ; GFX942-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: 
scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3295,7 +3121,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo @@ -3318,9 +3143,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3339,13 +3163,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX11-NEXT: v_max_f64 v[2:3], 
v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3379,16 +3201,14 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_cbranch_execz .LBB19_2 ; GFX10-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3420,15 +3240,13 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_cbranch_execz .LBB19_2 ; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 
; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3436,7 +3254,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc @@ -3459,8 +3276,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3476,14 +3292,13 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; 
GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3491,7 +3306,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3519,8 +3333,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3536,15 +3349,14 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX8-NEXT: 
buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3578,16 +3390,14 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_cbranch_execz .LBB19_2 ; GFX7-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3604,7 +3414,6 @@ define double 
@flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo @@ -3630,9 +3439,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3652,13 +3460,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 
s[30:31] @@ -3693,14 +3499,12 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB20_2 ; GFX942-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3708,7 +3512,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo @@ -3731,9 +3534,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3752,13 +3554,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3792,16 +3592,14 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_cbranch_execz .LBB20_2 ; GFX10-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: 
buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3833,15 +3631,13 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_cbranch_execz .LBB20_2 ; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3849,7 +3645,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc @@ -3872,8 +3667,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3889,14 +3683,13 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3904,7 +3697,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3932,8 +3724,7 @@ define double 
@flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3949,15 +3740,14 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3991,16 +3781,14 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_cbranch_execz .LBB20_2 ; GFX7-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: 
buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4017,7 +3805,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -4033,20 +3820,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -4055,19 +3840,17 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4098,12 +3881,10 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_cbranch_execz .LBB21_2 ; GFX942-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 
0, v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] @@ -4113,7 +3894,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4126,21 +3906,19 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: 
v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4148,18 +3926,16 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4191,13 +3967,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: s_cbranch_execz .LBB21_2 ; GFX10-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 
offen offset:4 @@ -4232,9 +4006,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -4245,7 +4017,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4258,38 +4029,36 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; 
GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4297,7 +4066,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ 
-4312,42 +4080,40 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[2:3] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, 
v3, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4378,13 +4144,11 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_cbranch_execz .LBB21_2 ; GFX7-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen @@ -4403,14 +4167,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], 
src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4422,20 +4185,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -4443,20 +4204,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4489,12 +4248,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB22_2 ; GFX942-NEXT: .LBB22_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] @@ -4504,13 +4261,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: 
v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4520,40 +4276,36 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: 
$vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4587,13 +4339,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_cbranch_execz .LBB22_2 ; GFX10-NEXT: .LBB22_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -4630,9 +4380,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; 
GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -4643,11 +4391,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4658,38 +4405,36 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 
vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4697,13 +4442,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4714,42 +4458,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: 
s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4782,13 +4524,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_cbranch_execz .LBB22_2 ; GFX7-NEXT: .LBB22_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7-NEXT: v_max_f64 v[0:1], 
v[0:1], v[2:3] ; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen @@ -4808,14 +4548,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4827,20 +4566,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -4848,20 +4585,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4895,12 +4630,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB23_2 ; GFX942-NEXT: .LBB23_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 +; 
GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] @@ -4910,13 +4643,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4926,40 +4658,36 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] 
glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4993,13 +4721,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_cbranch_execz .LBB23_2 ; GFX10-NEXT: .LBB23_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, 
vcc_lo ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -5036,9 +4762,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -5049,11 +4773,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5064,38 +4787,36 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; 
GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: 
v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5103,13 +4824,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5120,42 +4840,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword 
v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5188,13 +4906,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_cbranch_execz .LBB23_2 ; GFX7-NEXT: .LBB23_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen @@ -5214,29 +4930,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; 
GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -5245,7 +4959,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5254,16 +4968,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, 
v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5295,14 +5007,12 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_cbranch_execz .LBB24_2 ; GFX942-NEXT: .LBB24_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5310,29 +5020,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 
v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5340,57 +5048,53 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: 
v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_4 ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_2 ; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: .LBB24_4: ; %Flow2 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 ; GFX10-NEXT: s_cbranch_execz 
.LBB24_6 @@ -5398,18 +5102,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5417,138 +5120,134 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB24_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB24_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB24_6 +; GFX908-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB24_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 
v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB24_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB24_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB24_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB24_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB24_2 +; GFX908-NEXT: .LBB24_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; 
; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; 
implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5556,17 +5255,16 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5575,37 +5273,35 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[2:3] -; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_add_i32_e32 
v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] +; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5613,17 +5309,16 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX7-NEXT: 
buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5638,29 +5333,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -5669,7 +5362,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5678,16 +5371,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5719,14 +5410,12 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX942-NEXT: s_cbranch_execz .LBB25_2 ; GFX942-NEXT: .LBB25_4: ; 
%atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5734,29 +5423,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5764,22 +5451,20 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5812,16 +5497,14 @@ define double 
@flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX10-NEXT: s_cbranch_execz .LBB25_2 ; GFX10-NEXT: .LBB25_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5853,15 +5536,13 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX90A-NEXT: s_cbranch_execz .LBB25_2 ; GFX90A-NEXT: .LBB25_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 
0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5869,90 +5550,89 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB25_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB25_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB25_6 +; GFX908-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], 
v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB25_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB25_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB25_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB25_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB25_2 +; GFX908-NEXT: .LBB25_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; 
GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 
s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5960,17 +5640,16 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6003,16 +5682,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX7-NEXT: s_cbranch_execz .LBB25_2 ; GFX7-NEXT: .LBB25_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; 
GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: v_max_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6033,9 +5710,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6050,11 +5726,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, 
v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6079,9 +5753,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6095,12 +5768,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6129,12 +5801,10 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: 
v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -6157,9 +5827,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6174,11 +5843,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6199,9 +5866,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6215,12 +5881,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6241,7 +5906,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6254,7 +5918,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, 
v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -6284,12 +5947,10 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -6311,33 +5972,31 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v5, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX908-NEXT: 
v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -6352,13 +6011,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6421,35 +6078,30 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: 
s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6460,7 +6112,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6473,15 +6125,13 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6489,12 +6139,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6518,69 +6167,62 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 
3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6591,19 +6233,18 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: 
flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6615,12 +6256,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6642,10 +6282,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6655,7 +6294,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -6676,36 +6314,34 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6714,33 +6350,31 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v5, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6756,13 +6390,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6827,35 +6459,30 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 
v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6866,7 +6493,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6879,15 +6506,13 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6895,12 +6520,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6925,69 +6549,62 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6998,19 +6615,18 @@ define half 
@flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7022,12 +6638,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -7049,10 +6664,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7062,7 +6676,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -7083,36 +6696,34 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: 
s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7121,33 +6732,31 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v5, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 
v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7163,13 +6772,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7235,9 +6842,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7251,10 +6857,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7279,9 +6884,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; 
GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7294,12 +6898,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7328,12 +6930,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -7355,9 +6955,8 @@ define void 
@flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7371,10 +6970,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7395,9 +6993,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7410,12 +7007,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7436,7 +7031,6 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7448,7 +7042,6 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -7478,12 +7071,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -7511,12 +7102,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -7544,12 +7133,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7611,37 +7198,33 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) 
-; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -7658,37 +7241,33 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -7705,30 +7284,28 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, 
v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7738,38 +7315,34 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7781,37 +7354,34 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7823,31 +7393,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7858,31 
+7426,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, 
exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7892,31 +7458,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7926,32 +7490,30 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8005,37 +7567,33 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8052,37 +7610,33 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8100,30 +7654,28 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB31_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8133,38 +7685,34 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8176,37 +7724,34 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 
0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, 
v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8218,31 +7763,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, 
v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -8253,31 +7796,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; 
GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8287,31 +7828,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8321,32 +7860,30 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8401,7 +7938,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8409,9 +7945,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v2.l ; 
GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8436,17 +7970,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8466,36 +7998,33 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_max_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8503,9 +8032,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 
0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8526,17 +8053,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8558,23 +8083,21 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v1, 
v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -8585,27 +8108,25 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8613,14 +8134,12 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v4, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8641,19 +8160,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8702,15 +8219,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8734,24 +8249,21 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v2 ; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8764,24 +8276,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_max_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 
sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8792,15 +8302,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8820,25 +8328,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; 
GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8852,23 +8357,21 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8879,22 +8382,20 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8904,22 +8405,20 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v3, v4, v2 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8931,22 +8430,20 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
-; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v3, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8993,36 +8490,31 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9033,7 +8525,7 @@ define half 
@flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9046,15 +8538,13 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9062,12 +8552,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -9092,69 +8581,62 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; 
GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; 
GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9165,19 +8647,18 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9189,12 +8670,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -9216,10 +8696,9 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -9229,7 +8708,6 @@ 
define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -9250,39 +8728,37 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, 
v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9291,33 +8767,31 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v5, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: 
flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9333,13 +8807,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -9404,38 +8876,34 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; 
GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: 
global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -9452,38 +8920,34 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -9500,30 +8964,28 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; 
GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB35_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9533,38 +8995,34 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 
v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 
v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9576,37 +9034,34 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9618,31 +9073,29 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: 
flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -9653,34 +9106,32 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9690,31 +9141,29 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9724,32 +9173,30 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: 
v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10230,7 +9677,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v6, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 @@ -10696,7 +10142,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_mov_b32_e32 v6, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 @@ -11164,7 +10609,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_mov_b32_e32 v6, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 @@ -11603,7 +11047,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v7, 
v4, v5 @@ -12055,7 +11498,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -12509,7 +11951,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -12884,7 +12325,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -13245,7 +12685,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -13716,7 +13155,6 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_mov_b32_e32 v6, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 @@ -14174,7 +13612,6 @@ define void 
@flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -14209,15 +13646,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14236,43 +13671,38 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt 
vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14292,14 +13722,12 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: 
v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14317,25 +13745,23 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -14343,13 +13769,11 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14367,21 +13791,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -14440,15 +13860,13 @@ define <2 x half> 
@flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14467,43 +13885,38 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB47_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14524,21 +13937,19 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB47_1 @@ -14549,25 +13960,23 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14575,13 +13984,11 @@ define <2 x 
half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14601,21 +14008,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -14676,15 +14079,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14707,25 +14108,22 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX942-NEXT: v_pk_max_f16 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: 
flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14738,22 +14136,20 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX11-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v1, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14767,21 +14163,19 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; 
GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -14799,12 +14193,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14827,17 +14219,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX908-NEXT: v_pk_max_f16 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -14852,21 +14242,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 
v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -14926,21 +14312,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -14953,23 +14336,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; 
GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB49_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14979,22 +14359,19 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15006,21 +14383,19 @@ 
define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -15031,20 +14406,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15054,20 +14427,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15077,24 +14448,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15147,21 +14514,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; 
GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15174,23 +14538,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15200,22 
+14561,19 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15229,21 +14587,19 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v3, 
v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -15254,20 +14610,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15277,20 +14631,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: 
flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15302,24 +14654,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15375,21 +14723,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15406,25 +14751,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15437,22 +14779,19 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15466,21 +14805,19 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: 
s_cbranch_execnz .LBB51_1 @@ -15498,12 +14835,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -15526,12 +14861,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15549,24 +14882,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15623,15 +14952,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -15651,43 +14978,38 @@ define <2 x half> 
@flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15708,21 +15030,19 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -15733,28 +15053,26 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, 
v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15762,13 +15080,11 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15788,21 +15104,17 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f 
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15862,22 +15174,19 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15890,23 +15199,20 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; 
GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15916,22 +15222,19 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15945,21 +15248,19 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -15970,23 +15271,21 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: 
s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15996,20 +15295,18 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16021,24 +15318,20 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 1b3fd173ab7b5..409bdfa5232c0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -31,27 +31,25 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; 
GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -77,25 +75,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -103,13 +99,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -127,13 +121,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -174,27 +166,25 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; 
GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -222,25 +212,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; 
GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -248,13 +236,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -274,17 +260,15 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: 
flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 @@ -327,24 +311,22 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_min_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -386,12 +368,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -414,17 +394,15 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 @@ -439,17 +417,15 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], 
v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 @@ -488,22 +464,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -535,20 +509,18 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] 
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -558,20 +530,18 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -581,20 +551,18 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -629,22 +597,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 
v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -678,20 +644,18 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -701,20 +665,18 @@ define void 
@flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -726,20 +688,18 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], 
v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -781,24 +741,22 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -842,12 +800,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: 
.LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -870,12 +826,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -893,20 +847,18 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -945,27 +897,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -993,28 +943,26 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1022,13 +970,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; 
GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1048,17 +994,15 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 @@ -1098,22 +1042,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; 
GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1147,23 +1089,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1173,20 +1113,18 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: 
flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1198,20 +1136,18 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1249,42 +1185,38 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; 
GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1304,14 +1236,12 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1329,25 +1259,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: 
v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1355,13 +1283,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1379,13 +1305,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX8-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1403,13 +1327,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -1442,27 +1364,25 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 
exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1488,25 +1408,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1514,13 +1432,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1538,13 +1454,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1589,27 +1503,25 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt 
vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1635,25 +1547,23 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1661,13 +1571,11 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1685,13 +1593,11 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1732,27 +1638,25 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, 
v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1780,25 +1684,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; 
GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1806,13 +1708,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1832,17 +1732,15 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: 
v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 @@ -1885,24 +1783,22 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_min_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1944,12 +1840,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, 
v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1972,17 +1866,15 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -1997,17 +1889,15 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, 
v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -2046,22 +1936,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2093,20 +1981,18 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2116,20 +2002,18 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2139,20 +2023,18 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2187,22 +2069,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2236,20 +2116,18 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; 
%bb.2: ; %atomicrmw.end @@ -2259,20 +2137,18 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2284,20 +2160,18 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: 
v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2339,24 +2213,22 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2400,12 +2272,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2428,12 +2298,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2451,20 +2319,18 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 
v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2503,27 +2369,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2551,28 +2415,26 @@ define float 
@flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2580,13 +2442,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2606,17 +2466,15 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -2656,22 +2514,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; 
GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2705,23 +2561,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz 
.LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2731,20 +2585,18 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2756,20 +2608,18 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2802,29 +2652,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], 
v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -2833,7 +2681,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -2842,16 +2690,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2883,14 +2729,12 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_cbranch_execz .LBB18_2 ; GFX942-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX942-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -2898,29 +2742,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: 
v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2928,22 +2770,20 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -2976,16 +2816,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: s_cbranch_execz .LBB18_2 ; GFX10-NEXT: .LBB18_4: ; 
%atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3017,15 +2855,13 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; 
GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3033,90 +2869,89 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB18_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_6 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; 
GFX908-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB18_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB18_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB18_2 +; GFX908-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi +; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: 
v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3124,17 +2959,16 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3167,16 +3001,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_cbranch_execz .LBB18_2 ; GFX7-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) 
-; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3192,7 +3024,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo @@ -3218,9 +3049,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3240,13 +3070,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo 
+; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3280,14 +3108,12 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB19_2 ; GFX942-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3295,7 +3121,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo @@ -3318,9 +3143,8 @@ define double 
@flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3339,13 +3163,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3379,16 +3201,14 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_cbranch_execz .LBB19_2 ; GFX10-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v1, 
v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3420,15 +3240,13 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_cbranch_execz .LBB19_2 ; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3436,7 +3254,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; 
GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc @@ -3459,8 +3276,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3476,14 +3292,13 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
s_setpc_b64 s[30:31] @@ -3491,7 +3306,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3519,8 +3333,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3536,15 +3349,14 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3578,16 +3390,14 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_cbranch_execz .LBB19_2 ; GFX7-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3604,7 +3414,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo @@ -3630,9 +3439,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3652,13 +3460,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3693,14 +3499,12 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB20_2 ; GFX942-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], 
v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3708,7 +3512,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo @@ -3731,9 +3534,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3752,13 +3554,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3792,16 +3592,14 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_cbranch_execz .LBB20_2 ; GFX10-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3833,15 +3631,13 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_cbranch_execz .LBB20_2 ; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: 
v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3849,7 +3645,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc @@ -3872,8 +3667,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3889,14 +3683,13 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX908-NEXT: 
buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -3904,7 +3697,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3932,8 +3724,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3949,15 +3740,14 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; 
GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3991,16 +3781,14 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_cbranch_execz .LBB20_2 ; GFX7-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4017,7 +3805,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -4033,20 +3820,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -4055,19 +3840,17 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4098,12 +3881,10 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_cbranch_execz .LBB21_2 ; GFX942-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX942-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] @@ -4113,7 +3894,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: 
v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4126,21 +3906,19 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4148,18 +3926,16 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 
v2, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4191,13 +3967,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: s_cbranch_execz .LBB21_2 ; GFX10-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -4232,9 +4006,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -4245,7 +4017,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; 
GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4258,38 +4029,36 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: 
v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4297,7 +4066,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4312,42 +4080,40 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[2:3] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4378,13 +4144,11 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_cbranch_execz .LBB21_2 ; GFX7-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen @@ -4403,14 +4167,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4422,20 +4185,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -4443,20 +4204,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 
v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4489,12 +4248,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB22_2 ; GFX942-NEXT: .LBB22_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX942-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] @@ -4504,13 +4261,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4520,40 +4276,36 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 
s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4587,13 +4339,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_cbranch_execz .LBB22_2 ; GFX10-NEXT: .LBB22_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -4630,9 +4380,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -4643,11 +4391,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 +; GFX908-NEXT: v_add_co_u32_e32 
v0, vcc, 0x7f8, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4658,38 +4405,36 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; 
GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4697,13 +4442,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4714,42 +4458,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; 
GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4782,13 +4524,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_cbranch_execz .LBB22_2 ; GFX7-NEXT: .LBB22_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen @@ -4808,14 +4548,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], 
src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4827,20 +4566,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -4848,20 +4585,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4895,12 +4630,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_cbranch_execz .LBB23_2 ; GFX942-NEXT: .LBB23_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX942-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] @@ -4910,13 +4643,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: 
v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4926,40 +4658,36 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; 
implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4993,13 +4721,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_cbranch_execz .LBB23_2 ; GFX10-NEXT: .LBB23_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -5036,9 +4762,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen 
offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 @@ -5049,11 +4773,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5064,38 +4787,36 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; 
GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5103,13 +4824,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5120,42 +4840,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5188,13 +4906,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_cbranch_execz .LBB23_2 ; GFX7-NEXT: .LBB23_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], 
v[0:1] ; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen @@ -5214,29 +4930,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: 
s_wait_alu depctr_sa_sdst(0) @@ -5245,7 +4959,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5254,16 +4968,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5295,14 +5007,12 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_cbranch_execz .LBB24_2 ; GFX942-NEXT: .LBB24_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, 
off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5310,29 +5020,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 
vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5340,57 +5048,53 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_4 ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; 
GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_2 ; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: .LBB24_4: ; %Flow2 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_6 @@ -5398,18 +5102,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX10-NEXT: 
buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5417,138 +5120,134 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX90A-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB24_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB24_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB24_6 +; GFX908-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB24_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB24_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB24_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB24_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; 
GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB24_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB24_2 +; GFX908-NEXT: .LBB24_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5556,17 +5255,16 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5575,37 +5273,35 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[2:3] -; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] +; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_min_f64 v[4:5], v[6:7], 
v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5613,17 +5309,16 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5638,29 +5333,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -5669,7 +5362,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; 
GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5678,16 +5371,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5719,14 +5410,12 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX942-NEXT: s_cbranch_execz .LBB25_2 ; GFX942-NEXT: .LBB25_4: ; %atomicrmw.private ; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX942-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX942-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5734,29 +5423,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5764,22 +5451,20 @@ define double 
@flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5812,16 +5497,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX10-NEXT: s_cbranch_execz .LBB25_2 ; GFX10-NEXT: .LBB25_4: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; 
GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5853,15 +5536,13 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX90A-NEXT: s_cbranch_execz .LBB25_2 ; GFX90A-NEXT: .LBB25_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5869,90 +5550,89 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; 
GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB25_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB25_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB25_6 +; GFX908-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB25_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB25_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; 
GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB25_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB25_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB25_2 +; GFX908-NEXT: .LBB25_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi +; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 
vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5960,17 +5640,16 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, 
s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6003,16 +5682,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX7-NEXT: s_cbranch_execz .LBB25_2 ; GFX7-NEXT: .LBB25_4: ; %atomicrmw.private ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: v_min_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -6033,9 +5710,8 @@ define half 
@flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6050,11 +5726,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6079,9 +5753,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, 
v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6095,12 +5768,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6129,12 +5801,10 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -6157,9 +5827,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6174,11 +5843,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6199,9 +5866,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6215,12 +5881,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6241,7 +5906,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6254,7 +5918,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -6284,12 +5947,10 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: 
v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -6311,33 +5972,31 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v5, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: 
v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -6352,13 +6011,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6421,35 +6078,30 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] 
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6460,7 +6112,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6473,15 +6125,13 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6489,12 +6139,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; 
GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6518,69 +6167,62 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, 
v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6591,19 +6233,18 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6615,12 +6256,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6642,10 +6282,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6655,7 +6294,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -6676,36 +6314,34 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6714,33 +6350,31 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v5, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6756,13 +6390,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6827,35 +6459,30 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6866,7 +6493,7 @@ define half 
@flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6879,15 +6506,13 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6895,12 +6520,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 -; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6925,69 +6549,62 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; 
GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6998,19 +6615,18 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7022,12 +6638,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -7049,10 +6664,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; 
GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7062,7 +6676,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -7083,36 +6696,34 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: 
v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7121,33 +6732,31 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v5, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; 
GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7163,13 +6772,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7235,9 +6842,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7251,10 +6857,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7279,9 +6884,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7294,12 +6898,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: 
v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7328,12 +6930,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -7355,9 +6955,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7371,10 +6970,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7395,9 +6993,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7410,12 +7007,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, 
v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7436,7 +7031,6 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7448,7 +7042,6 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -7478,12 +7071,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -7511,12 +7102,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -7544,12 +7133,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7611,37 +7198,33 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 
-4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -7658,37 +7241,33 @@ 
define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -7705,30 +7284,28 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; 
GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7738,38 +7315,34 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: 
v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 @@ -7781,37 +7354,34 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7823,31 +7393,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: 
.LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7858,31 +7426,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; 
GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7892,31 +7458,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: 
v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7926,32 +7490,30 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: 
v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8005,37 +7567,33 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, 
null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: 
flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8052,37 +7610,33 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; 
GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8100,30 +7654,28 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; 
GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB31_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8133,38 +7685,34 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8176,37 +7724,34 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8218,31 +7763,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 
v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -8253,31 +7796,29 @@ define void 
@flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8287,31 +7828,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8321,32 +7860,30 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 
vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8401,7 +7938,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8409,9 +7945,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8436,17 +7970,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8466,36 +7998,33 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_min_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, 
exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8503,9 +8032,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8526,17 +8053,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v4, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8558,23 +8083,21 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v1, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -8585,27 +8108,25 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-LABEL: 
flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX90A-NEXT: v_min_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8613,14 +8134,12 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v4, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8641,19 +8160,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8702,15 +8219,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: 
v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8734,24 +8249,21 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv 
scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8764,24 +8276,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_min_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8792,15 +8302,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8820,25 +8328,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: 
v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8852,23 +8357,21 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_min_f16_e32 v3, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8879,22 +8382,20 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX90A-NEXT: v_min_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8904,22 +8405,20 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX908-NEXT: v_min_f16_e32 v3, v4, v2 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8931,22 +8430,20 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_e32 v3, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8993,36 +8490,31 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9033,7 +8525,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9046,15 +8538,13 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: 
v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9062,12 +8552,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -9092,69 +8581,62 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: 
v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; 
GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9165,19 +8647,18 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9189,12 +8670,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -9216,10 +8696,9 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -9229,7 +8708,6 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -9250,39 +8728,37 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 
0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 
v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9291,33 +8767,31 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: flat_load_dword v5, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; 
GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9333,13 +8807,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -9404,38 +8876,34 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -9452,38 +8920,34 @@ define void 
@flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -9500,30 +8964,28 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, 
v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB35_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9533,38 +8995,34 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9576,37 +9034,34 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 
0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9618,31 +9073,29 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: 
v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -9653,34 +9106,32 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9690,31 +9141,29 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; 
GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9724,32 +9173,30 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; 
GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10230,7 +9677,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v6, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, 
v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 @@ -10696,7 +10142,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_mov_b32_e32 v6, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 @@ -11164,7 +10609,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_mov_b32_e32 v6, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 @@ -11603,7 +11047,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 @@ -12055,7 +11498,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -12509,7 +11951,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -12884,7 +12325,6 @@ 
define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -13245,7 +12685,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -13716,7 +13155,6 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_mov_b32_e32 v6, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 @@ -14174,7 +13612,6 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -14209,15 +13646,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14236,43 +13671,38 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14292,14 +13722,12 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14317,25 +13745,23 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -14343,13 +13769,11 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14367,21 +13791,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -14440,15 +13860,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ 
-14467,43 +13885,38 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB47_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14524,21 +13937,19 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_min_f16 v0, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB47_1 @@ -14549,25 +13960,23 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14575,13 +13984,11 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14601,21 +14008,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; 
GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -14676,15 +14079,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14707,25 +14108,22 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX942-NEXT: v_pk_min_f16 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14738,22 +14136,20 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-NEXT: .LBB48_1: ; 
%atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX11-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v0, v1, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14767,21 +14163,19 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_min_f16 v0, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; 
GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -14799,12 +14193,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_min_f16 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14827,17 +14219,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX908-NEXT: v_pk_min_f16 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_pk_min_f16 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -14852,21 +14242,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -14926,21 +14312,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -14953,23 +14336,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB49_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14979,22 +14359,19 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: 
flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15006,21 +14383,19 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -15031,20 +14406,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15054,20 +14427,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: 
v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15077,24 +14448,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, 
v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15147,21 +14514,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15174,23 +14538,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 
; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15200,22 +14561,19 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15229,21 +14587,19 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -15254,20 +14610,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15277,20 +14631,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 
v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15302,24 +14654,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15375,21 +14723,18 @@ 
define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15406,25 +14751,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 
+; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15437,22 +14779,19 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15466,21 +14805,19 @@ define void 
@flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15498,12 +14835,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_min_f16 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -15526,12 +14861,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX908-NEXT: v_pk_min_f16 v0, v1, v2 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15549,24 +14882,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15623,15 +14952,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -15651,43 +14978,38 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15708,21 +15030,19 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_min_f16 v0, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -15733,28 +15053,26 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: 
s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15762,13 +15080,11 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15788,21 +15104,17 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: 
v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15862,22 +15174,19 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15890,23 +15199,20 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ 
; GFX942-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15916,22 +15222,19 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; 
GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15945,21 +15248,19 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -15970,23 +15271,21 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: 
flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15996,20 +15295,18 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 
+; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16021,24 +15318,20 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 54fb38ba877ad..2be1c5c3c31ee 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -12356,13 +12356,10 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -12430,10 +12427,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 -; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -12508,12 +12503,9 @@ define void 
@flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] @@ -12581,9 +12573,7 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 -; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -12649,13 +12639,10 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, 
v[0:1], s0 ; GFX1250-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -12723,10 +12710,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 -; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -12801,12 +12786,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] @@ -12874,9 +12856,7 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 -; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 
GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -12996,23 +12976,22 @@ define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v5, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v5, v0 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v1, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v1 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmax_f32_saddr_rtn: @@ -13058,19 +13037,17 @@ define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v5, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB119_1: 
; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v5, v0 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v1, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v1 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13119,23 +13096,22 @@ define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v5, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX950-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v5, v0 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v1, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v1 
; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmin_f32_saddr_rtn: @@ -13181,19 +13157,17 @@ define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v5, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX950-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v5, v0 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v1, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v1 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13292,57 +13266,82 @@ define void @flat_atomic_fadd_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data } define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { -; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; 
GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40 -; GFX1250-NEXT: .LBB124_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v5, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX1250-NEXT: v_pk_max_num_f16 v4, v0, v1 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1250-NEXT: s_cbranch_execnz .LBB124_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-SDAG-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v5, v1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v4, v5, v0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX1250-SDAG-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB124_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40 +; GFX1250-GISEL-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v4, v0, v1 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX1250-GISEL-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB124_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: 
flat_atomic_fmax_v2f16_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v5, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v5, v0 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v1, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v1 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: @@ -13375,52 +13374,74 @@ define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % } define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { -; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 -; GFX1250-NEXT: .LBB125_1: ; 
%atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v3 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1250-NEXT: s_cbranch_execnz .LBB125_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 +; GFX1250-SDAG-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40 +; GFX1250-SDAG-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_max_num_f16 v2, v3, v0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-SDAG-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB125_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_or_b32 
exec_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v3, v0, v0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-GISEL-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-GISEL-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB125_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v5, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v5, v0 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v1, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v1 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13456,57 +13477,82 @@ define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data } define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { -; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40 -; GFX1250-NEXT: .LBB126_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v5, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX1250-NEXT: v_pk_min_num_f16 v4, v0, v1 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1250-NEXT: s_cbranch_execnz .LBB126_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-SDAG-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v5, v1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_pk_min_num_f16 v4, v5, v0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX1250-SDAG-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB126_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v2, s[0:1] 
offset:40 +; GFX1250-GISEL-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX1250-GISEL-NEXT: v_pk_min_num_f16 v4, v0, v1 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX1250-GISEL-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB126_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v5, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v4, v5, v0 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v1, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v1 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v1 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: @@ -13539,52 +13585,74 @@ define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % } define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { -; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0 -; GFX1250-NEXT: s_mov_b32 s2, 0 -; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 -; GFX1250-NEXT: .LBB127_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v3 -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1250-NEXT: s_cbranch_execnz .LBB127_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-LABEL: 
flat_atomic_fmin_v2f16_saddr_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 +; GFX1250-SDAG-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40 +; GFX1250-SDAG-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_pk_min_num_f16 v2, v3, v0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-SDAG-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB127_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v3, v0, v0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-GISEL-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b32 v0, v2, 
v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-GISEL-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB127_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v5, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v4, v5, v0 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v1, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v1 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB127_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 4368f2a5de3b1..cb5e5655898de 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -163,7 +163,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX1250-LABEL: test_fmax3_olt_0_f32: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-NEXT: s_mov_b32 s6, s2 @@ -354,7 +354,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX1250-LABEL: test_fmax3_olt_1_f32: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-NEXT: s_mov_b32 s6, s2 @@ -445,20 +445,17 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e32 v1, v2, v2 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v2 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -615,7 +612,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16: ; GFX1250-TRUE16: ; %bb.0: ; 
GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 @@ -646,7 +643,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16: ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 @@ -738,20 +735,17 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e32 v1, v2, v2 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e32 v0, v2, v0 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -908,7 +902,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16: ; GFX1250-TRUE16: ; %bb.0: ; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: 
s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 @@ -939,7 +933,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16: ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index 1b494deca08aa..f980288865b8b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -289,8 +289,6 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16_fast(<3 x half> %a, <3 x half> %b) ; GFX9-LABEL: test_fmax_legacy_ugt_v3f16_fast: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -328,10 +326,7 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16_fast(<3 x half> %a, <3 x half> %b) ; GFX11-LABEL: test_fmax_legacy_ugt_v3f16_fast: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <3 x half> %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 668347eb97004..e9473db44f23e 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -5524,22 +5524,18 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) ; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) -; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s10, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: buffer_store_dword v5, off, s[8:11], 0 @@ -5584,35 +5580,32 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 -; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-SDAG-NEXT: v_addc_u32_e32 v3, 
vcc, 0, v3, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 -; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) ; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) -; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 ; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-SDAG-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 -; VI-SDAG-NEXT: flat_store_dword v[0:1], v5 +; VI-SDAG-NEXT: v_min_f32_e32 v4, v6, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v6, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v4 ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_endpgm ; @@ -5655,51 +5648,93 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v[0:1], v4, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-SDAG-NEXT: global_store_dword v[0:1], v4, off +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 -; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4 -; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: global_store_dword v[0:1], v4, off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc 
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v3, v4 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -5726,22 +5761,18 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-SDAG-NEXT: 
s_mov_b64 s[18:19], s[10:11] ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) ; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) -; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s10, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: buffer_store_dword v2, off, s[8:11], 0 @@ -5786,35 +5817,32 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 -; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 -; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) ; 
VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) -; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-SDAG-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v4, v6, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v6, v2 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) ; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_endpgm ; @@ -5857,28 +5885,48 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v[0:1], v1, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-SDAG-NEXT: global_store_dword v[0:1], v1, off +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: global_store_dword v[0:1], v1, off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; 
GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX11-SDAG: ; %bb.0: @@ -5893,12 +5941,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX11-SDAG-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3 ; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5953,22 +5998,18 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) ; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) -; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s10, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 
v3, 1.0, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 @@ -6013,35 +6054,32 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 -; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 -; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) ; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) -; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-SDAG-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v4, v6, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v6, v2 ; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) -; VI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_endpgm ; @@ -6084,51 +6122,92 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: global_store_dword v[0:1], v1, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-SDAG-NEXT: global_store_dword v[0:1], v1, off +; 
GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 -; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: global_store_dword 
v[0:1], v1, off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-GISEL-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -6163,15 +6242,12 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) ; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[6:7] -; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 ; SI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-SDAG-NEXT: s_endpgm @@ -6227,13 +6303,10 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-SDAG-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-SDAG-NEXT: v_min_f32_e32 v4, v6, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v6, v2 ; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-SDAG-NEXT: 
flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_endpgm ; @@ -6274,26 +6347,44 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_safe_med3_f32_pat0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[8:9] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_safe_med3_f32_pat0: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_safe_med3_f32_pat0: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; GFX11-SDAG: ; %bb.0: @@ -6308,10 +6399,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 -; GFX11-SDAG-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -7632,8 +7721,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll 
b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 142bdd42b2c00..b94370da69ec8 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -163,7 +163,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX1250-LABEL: test_fmin3_olt_0_f32: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-NEXT: s_mov_b32 s6, s2 @@ -354,7 +354,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX1250-LABEL: test_fmin3_olt_1_f32: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-NEXT: s_mov_b32 s6, s2 @@ -445,20 +445,17 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e32 v1, v2, v2 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v2 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -615,7 +612,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; 
GFX1250-TRUE16-LABEL: test_fmin3_olt_0_f16: ; GFX1250-TRUE16: ; %bb.0: ; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 @@ -646,7 +643,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX1250-FAKE16-LABEL: test_fmin3_olt_0_f16: ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 @@ -738,20 +735,17 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e32 v1, v2, v2 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 +; VI-NEXT: v_min_f16_e32 v0, v2, v0 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -908,7 +902,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX1250-TRUE16-LABEL: test_fmin3_olt_1_f16: ; GFX1250-TRUE16: ; %bb.0: ; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; 
GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 @@ -939,7 +933,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX1250-FAKE16-LABEL: test_fmin3_olt_1_f16: ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 @@ -1080,28 +1074,25 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; SI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; SI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; SI-NEXT: v_min_f64 v[0:1], 
v[0:1], v[4:5] ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -1131,11 +1122,8 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -1163,11 +1151,8 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: s_mov_b32 s1, s9 -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1189,18 +1174,17 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc +; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ; @@ -1222,25 +1206,24 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s6 -; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s4, s6 +; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: s_mov_b32 s6, s10 +; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_mov_b32 s8, s0 -; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s9, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: test_fmin3_olt_0_f64: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-NEXT: s_mov_b32 s6, s2 @@ -1262,12 +1245,9 @@ define amdgpu_kernel void 
@test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s1, s9 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 @@ -1290,28 +1270,25 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; SI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; SI-NEXT: v_max_f64 v[2:3], v[4:5], 
v[4:5] -; SI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; SI-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -1341,11 +1318,8 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; VI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; VI-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_endpgm ; @@ -1373,11 +1347,8 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: s_mov_b32 s1, s9 -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1399,18 +1370,17 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc +; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: v_max_f64 
v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ; @@ -1432,25 +1402,24 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s6 -; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s4, s6 +; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: s_mov_b32 s6, s10 +; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_mov_b32 s8, s0 -; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s9, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: test_fmin3_olt_1_f64: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-NEXT: s_mov_b32 
s6, s2 @@ -1472,12 +1441,9 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s1, s9 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 8c9dccceff192..41b7b7fe6b21a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -290,8 +290,6 @@ define <3 x half> @test_fmin_legacy_ule_v3f16_fast(<3 x half> %a, <3 x half> %b) ; GFX9-LABEL: test_fmin_legacy_ule_v3f16_fast: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -329,10 +327,7 @@ define <3 x half> @test_fmin_legacy_ule_v3f16_fast(<3 x half> %a, <3 x half> %b) ; GFX11-LABEL: test_fmin_legacy_ule_v3f16_fast: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp 
ule <3 x half> %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 13e4206ab7f57..2b17db2efebb9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -961,18 +961,13 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 { ; VI-LABEL: v_fneg_minnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, -v1, -v1 -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e64 v0, -v0, -v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_minnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half %a, half %b) %fneg = fneg half %min @@ -1063,16 +1058,13 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 { ; VI-LABEL: v_fneg_posk_minnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_max_f16_e32 v0, -4.0, v0 +; VI-NEXT: v_max_f16_e64 v0, -v0, -4.0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_posk_minnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, -4.0, v0 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, -4.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half 4.0, half %a) %fneg = fneg half %min @@ -1116,16 +1108,13 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 { ; VI-LABEL: v_fneg_negk_minnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 +; VI-NEXT: v_max_f16_e64 v0, -v0, 4.0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_negk_minnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, 4.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half -4.0, half %a) %fneg = fneg half %min @@ -1198,16 +1187,13 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 { ; VI-LABEL: v_fneg_neg0_minnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_max_f16_e32 v0, 0, v0 +; VI-NEXT: v_max_f16_e64 v0, -v0, 0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_neg0_minnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, 0, v0 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, 0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half -0.0, half %a) %fneg = fneg half %min @@ -1226,7 +1212,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; VI-LABEL: v_fneg_inv2pi_minnum_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 0.15915494, v0 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1234,9 +1219,8 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; GFX11-LABEL: v_fneg_inv2pi_minnum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half 0xH3118, half %a) @@ -1256,7 +1240,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 0.15915494, v0 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1264,9 +1247,8 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; GFX11-LABEL: v_fneg_neg_inv2pi_minnum_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half 0xH3118, half %a) @@ -1315,7 +1297,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 0, v0 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1323,9 +1304,8 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; GFX11-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f16_e32 v0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half 0.0, half %a) @@ -1350,7 +1330,6 @@ define 
half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 { ; VI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 0.15915494, v0 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1358,9 +1337,8 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 { ; GFX11-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half 0xH3118, half %a) @@ -1418,19 +1396,15 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) ; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, -v1, -v1 -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e64 v0, -v0, -v1 ; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half %a, half %b) @@ -1528,18 +1502,13 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 { ; 
VI-LABEL: v_fneg_maxnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, -v1, -v1 -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_min_f16_e64 v0, -v0, -v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_maxnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_min_f16_e64 v0, -v0, -v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half %a, half %b) %fneg = fneg half %max @@ -1630,16 +1599,13 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 { ; VI-LABEL: v_fneg_posk_maxnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_min_f16_e32 v0, -4.0, v0 +; VI-NEXT: v_min_f16_e64 v0, -v0, -4.0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_posk_maxnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v0, -4.0, v0 +; GFX11-NEXT: v_min_f16_e64 v0, -v0, -4.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half 4.0, half %a) %fneg = fneg half %max @@ -1683,16 +1649,13 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 { ; VI-LABEL: v_fneg_negk_maxnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 +; VI-NEXT: v_min_f16_e64 v0, -v0, 4.0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_negk_maxnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0 +; GFX11-NEXT: v_min_f16_e64 v0, -v0, 4.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half -4.0, half %a) %fneg = fneg half %max @@ -1765,16 +1728,13 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 { ; VI-LABEL: v_fneg_neg0_maxnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_min_f16_e32 v0, 0, v0 +; VI-NEXT: v_min_f16_e64 v0, -v0, 0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_neg0_maxnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v0, 0, v0 +; GFX11-NEXT: v_min_f16_e64 v0, -v0, 0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half -0.0, half %a) %fneg = fneg half %max @@ -1822,7 +1782,6 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v0, 0, v0 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1830,9 +1789,8 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; GFX11-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half 0.0, half %a) @@ -1890,19 +1848,15 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) ; VI-LABEL: 
v_fneg_maxnum_multi_use_maxnum_f16_ieee: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v1, -v1, -v1 -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_min_f16_e64 v0, -v0, -v1 ; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 -; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_min_f16_e64 v0, -v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half %a, half %b) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index cfa5247267559..3d852a83f69b9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1606,9 +1606,7 @@ define amdgpu_kernel void @v_fneg_minnum_f32_ieee(ptr addrspace(1) %out, ptr add ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v3, -1.0, v5 -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: v_max_f32_e64 v2, -v5, -v2 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -1631,9 +1629,7 @@ define amdgpu_kernel void @v_fneg_minnum_f32_ieee(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v3, -1.0, v5 -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; VI-NEXT: v_max_f32_e32 v2, v3, v2 +; VI-NEXT: v_max_f32_e64 v2, -v5, -v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm 
%tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1673,8 +1669,7 @@ define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(ptr addrspace(1) %out, pt ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; SI-NEXT: v_max_f32_e32 v2, v2, v2 +; SI-NEXT: v_max_f32_e64 v2, -v3, -v3 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -1691,8 +1686,7 @@ define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(ptr addrspace(1) %out, pt ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; VI-NEXT: v_max_f32_e32 v2, v2, v2 +; VI-NEXT: v_max_f32_e64 v2, -v3, -v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1730,8 +1724,7 @@ define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(ptr addrspace(1) %out, pt ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; SI-NEXT: v_max_f32_e32 v2, -4.0, v2 +; SI-NEXT: v_max_f32_e64 v2, -v3, -4.0 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -1748,8 +1741,7 @@ define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(ptr addrspace(1) %out, pt ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; VI-NEXT: v_max_f32_e32 v2, -4.0, v2 +; VI-NEXT: v_max_f32_e64 v2, -v3, -4.0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1787,8 +1779,7 @@ define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(ptr addrspace(1) %out, pt ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; 
SI-NEXT: v_max_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_max_f32_e64 v2, -v3, 4.0 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -1805,8 +1796,7 @@ define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(ptr addrspace(1) %out, pt ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; VI-NEXT: v_max_f32_e32 v2, 4.0, v2 +; VI-NEXT: v_max_f32_e64 v2, -v3, 4.0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1891,8 +1881,7 @@ define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(ptr addrspace(1) %out, pt ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_max_f32_e64 v2, -v3, 0 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -1909,8 +1898,7 @@ define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(ptr addrspace(1) %out, pt ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; VI-NEXT: v_max_f32_e32 v2, 0, v2 +; VI-NEXT: v_max_f32_e64 v2, -v3, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1937,9 +1925,9 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(ptr addrspace(1) %out, ptr a ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; SI-NEXT: s_mov_b32 s0, 0xbe22f983 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; SI-NEXT: v_max_f32_e32 v2, 0xbe22f983, v2 +; SI-NEXT: v_max_f32_e64 v2, -v3, s0 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -1953,11 +1941,10 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(ptr addrspace(1) %out, ptr a ; VI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; VI-NEXT: v_min_f32_e32 v2, 0.15915494, v2 +; VI-NEXT: v_min_f32_e32 v2, 0.15915494, v3 ; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1985,9 +1972,9 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(ptr addrspace(1) %out, p ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; SI-NEXT: s_mov_b32 s0, 0x3e22f983 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; SI-NEXT: v_max_f32_e32 v2, 0x3e22f983, v2 +; SI-NEXT: v_max_f32_e64 v2, -v3, s0 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -2004,8 +1991,7 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(ptr addrspace(1) %out, p ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; VI-NEXT: v_max_f32_e32 v2, 0.15915494, v2 +; VI-NEXT: v_max_f32_e64 v2, -v3, 0.15915494 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2049,11 +2035,10 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(ptr addrspace(1) %out, ptr a ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v3, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_max_f16_e32 v2, v3, v3 -; VI-NEXT: v_min_f16_e32 v2, 0.15915494, v2 +; VI-NEXT: v_min_f16_e32 v2, 0.15915494, v3 ; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; VI-NEXT: 
flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm @@ -2101,8 +2086,7 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(ptr addrspace(1) %out, p ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_max_f16_e64 v2, -v3, -v3 -; VI-NEXT: v_max_f16_e32 v2, 0.15915494, v2 +; VI-NEXT: v_max_f16_e64 v2, -v3, 0.15915494 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2132,8 +2116,7 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(ptr addrspace(1) %out, ptr a ; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] -; SI-NEXT: v_max_f64 v[0:1], v[0:1], s[2:3] +; SI-NEXT: v_max_f64 v[0:1], -v[0:1], s[2:3] ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm ; @@ -2150,7 +2133,6 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(ptr addrspace(1) %out, ptr a ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; VI-NEXT: v_min_f64 v[0:1], v[0:1], 0.15915494309189532 ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2182,8 +2164,7 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(ptr addrspace(1) %out, p ; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] -; SI-NEXT: v_max_f64 v[0:1], v[0:1], s[2:3] +; SI-NEXT: v_max_f64 v[0:1], -v[0:1], s[2:3] ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm ; @@ -2200,8 +2181,7 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(ptr addrspace(1) %out, p ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; 
VI-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] -; VI-NEXT: v_max_f64 v[0:1], v[0:1], 0.15915494309189532 +; VI-NEXT: v_max_f64 v[0:1], -v[0:1], 0.15915494309189532 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2230,24 +2210,23 @@ define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v3, s5 -; SI-NEXT: flat_load_dword v4, v[0:1] glc +; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: flat_load_dword v5, v[0:1] glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; SI-NEXT: flat_load_dword v3, v[0:1] glc +; SI-NEXT: flat_load_dword v2, v[2:3] glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; SI-NEXT: v_min_f32_e32 v2, 0, v2 -; SI-NEXT: v_mul_f32_e64 v2, -v2, v3 +; SI-NEXT: v_min_f32_e32 v3, 0, v5 +; SI-NEXT: v_mul_f32_e64 v2, -v3, v2 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -2255,24 +2234,23 @@ define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(ptr addrspace(1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, 
vcc, s2, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_load_dword v4, v[0:1] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; VI-NEXT: v_min_f32_e32 v2, 0, v2 -; VI-NEXT: v_mul_f32_e64 v2, -v2, v3 +; VI-NEXT: v_min_f32_e32 v3, 0, v5 +; VI-NEXT: v_mul_f32_e64 v2, -v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2294,24 +2272,24 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v3, s5 -; SI-NEXT: flat_load_dword v4, v[0:1] glc +; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: flat_load_dword v5, v[0:1] glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; SI-NEXT: flat_load_dword v3, v[0:1] glc +; SI-NEXT: flat_load_dword v2, v[2:3] glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 
s0, v4 +; SI-NEXT: s_mov_b32 s0, 0xbe22f983 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v4 -; SI-NEXT: v_max_f32_e32 v2, 0xbe22f983, v2 -; SI-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-NEXT: v_max_f32_e64 v3, -v5, s0 +; SI-NEXT: v_mul_f32_e32 v2, v3, v2 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -2319,24 +2297,23 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(ptr addrspace(1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_load_dword v4, v[0:1] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; VI-NEXT: v_min_f32_e32 v2, 0.15915494, v2 -; VI-NEXT: v_mul_f32_e64 v2, -v2, v3 +; VI-NEXT: v_min_f32_e32 v3, 0.15915494, v5 +; VI-NEXT: v_mul_f32_e64 v2, -v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2384,9 +2361,7 @@ define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: 
v_mul_f32_e32 v3, -1.0, v4 -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: v_max_f32_e64 v2, -v4, -v2 ; SI-NEXT: v_mul_f32_e32 v3, -4.0, v2 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -2412,9 +2387,7 @@ define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mul_f32_e32 v3, -1.0, v4 -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; VI-NEXT: v_max_f32_e32 v2, v3, v2 +; VI-NEXT: v_max_f32_e64 v2, -v4, -v2 ; VI-NEXT: v_mul_f32_e32 v3, -4.0, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2474,9 +2447,7 @@ define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(ptr addrspace(1) %out, ptr add ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v3, -1.0, v5 -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-NEXT: v_min_f32_e64 v2, -v5, -v2 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -2499,9 +2470,7 @@ define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v3, -1.0, v5 -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; VI-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-NEXT: v_min_f32_e64 v2, -v5, -v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2541,8 +2510,7 @@ define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(ptr addrspace(1) %out, pt ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; SI-NEXT: v_min_f32_e32 v2, v2, v2 +; SI-NEXT: v_min_f32_e64 v2, -v3, -v3 ; SI-NEXT: flat_store_dword v[0:1], v2 
; SI-NEXT: s_endpgm ; @@ -2559,8 +2527,7 @@ define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(ptr addrspace(1) %out, pt ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; VI-NEXT: v_min_f32_e32 v2, v2, v2 +; VI-NEXT: v_min_f32_e64 v2, -v3, -v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2598,8 +2565,7 @@ define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(ptr addrspace(1) %out, pt ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; SI-NEXT: v_min_f32_e32 v2, -4.0, v2 +; SI-NEXT: v_min_f32_e64 v2, -v3, -4.0 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -2616,8 +2582,7 @@ define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(ptr addrspace(1) %out, pt ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; VI-NEXT: v_min_f32_e32 v2, -4.0, v2 +; VI-NEXT: v_min_f32_e64 v2, -v3, -4.0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2655,8 +2620,7 @@ define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(ptr addrspace(1) %out, pt ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_min_f32_e64 v2, -v3, 4.0 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -2673,8 +2637,7 @@ define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(ptr addrspace(1) %out, pt ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; VI-NEXT: v_min_f32_e32 v2, 
4.0, v2 +; VI-NEXT: v_min_f32_e64 v2, -v3, 4.0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2759,8 +2722,7 @@ define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(ptr addrspace(1) %out, pt ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; SI-NEXT: v_min_f32_e32 v2, 0, v2 +; SI-NEXT: v_min_f32_e64 v2, -v3, 0 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -2777,8 +2739,7 @@ define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(ptr addrspace(1) %out, pt ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v3 -; VI-NEXT: v_min_f32_e32 v2, 0, v2 +; VI-NEXT: v_min_f32_e64 v2, -v3, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2807,24 +2768,23 @@ define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v3, s5 -; SI-NEXT: flat_load_dword v4, v[0:1] glc +; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: flat_load_dword v5, v[0:1] glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; SI-NEXT: flat_load_dword v3, v[0:1] glc +; SI-NEXT: flat_load_dword v2, v[2:3] glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, s0, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; SI-NEXT: v_max_f32_e32 v2, 0, v2 -; SI-NEXT: v_mul_f32_e64 v2, -v2, v3 +; SI-NEXT: v_max_f32_e32 v3, 0, v5 +; SI-NEXT: v_mul_f32_e64 v2, -v3, v2 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -2832,24 +2792,23 @@ define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(ptr addrspace(1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_load_dword v4, v[0:1] glc +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; VI-NEXT: v_max_f32_e32 v2, 0, v2 -; VI-NEXT: v_mul_f32_e64 v2, -v2, v3 +; VI-NEXT: v_max_f32_e32 v3, 0, v5 +; VI-NEXT: v_mul_f32_e64 v2, -v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2897,9 +2856,7 @@ define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mul_f32_e32 v3, -1.0, v4 -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: 
v_min_f32_e32 v2, v3, v2 +; SI-NEXT: v_min_f32_e64 v2, -v4, -v2 ; SI-NEXT: v_mul_f32_e32 v3, -4.0, v2 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -2925,9 +2882,7 @@ define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mul_f32_e32 v3, -1.0, v4 -; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; VI-NEXT: v_min_f32_e32 v2, v3, v2 +; VI-NEXT: v_min_f32_e64 v2, -v4, -v2 ; VI-NEXT: v_mul_f32_e32 v3, -4.0, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index e1fef9083e132..db37748ebb33d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -895,9 +895,7 @@ define float @v_fneg_minnum_f32_ieee(float %a, float %b) #0 { ; GCN-LABEL: v_fneg_minnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_max_f32_e64 v0, -v0, -v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float %a, float %b) %fneg = fneg float %min @@ -941,8 +939,7 @@ define float @v_fneg_posk_minnum_f32_ieee(float %a) #0 { ; GCN-LABEL: v_fneg_posk_minnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, -4.0, v0 +; GCN-NEXT: v_max_f32_e64 v0, -v0, -4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float 4.0, float %a) %fneg = fneg float %min @@ -964,8 +961,7 @@ define float @v_fneg_negk_minnum_f32_ieee(float %a) #0 { ; GCN-LABEL: v_fneg_negk_minnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: 
v_max_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_max_f32_e64 v0, -v0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float -4.0, float %a) %fneg = fneg float %min @@ -999,8 +995,7 @@ define float @v_fneg_neg0_minnum_f32_ieee(float %a) #0 { ; GCN-LABEL: v_fneg_neg0_minnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, 0, v0 +; GCN-NEXT: v_max_f32_e64 v0, -v0, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float -0.0, float %a) %fneg = fneg float %min @@ -1011,14 +1006,13 @@ define float @v_fneg_inv2pi_minnum_f32(float %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 0xbe22f983, v0 +; SI-NEXT: s_mov_b32 s4, 0xbe22f983 +; SI-NEXT: v_max_f32_e64 v0, -v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_inv2pi_minnum_f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; VI-NEXT: v_min_f32_e32 v0, 0.15915494, v0 ; VI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1031,15 +1025,14 @@ define float @v_fneg_neg_inv2pi_minnum_f32(float %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 0x3e22f983, v0 +; SI-NEXT: s_mov_b32 s4, 0x3e22f983 +; SI-NEXT: v_max_f32_e64 v0, -v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; VI-NEXT: v_max_f32_e32 v0, 0.15915494, v0 +; VI-NEXT: v_max_f32_e64 v0, -v0, 0.15915494 ; VI-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a) %fneg = fneg float %min @@ -1058,7 +1051,6 
@@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; VI-LABEL: v_fneg_inv2pi_minnum_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 0.15915494, v0 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1079,8 +1071,7 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_max_f16_e32 v0, 0.15915494, v0 +; VI-NEXT: v_max_f16_e64 v0, -v0, 0.15915494 ; VI-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half 0xHB118, half %a) %fneg = fneg half %min @@ -1091,16 +1082,14 @@ define double @v_fneg_inv2pi_minnum_f64(double %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] ; SI-NEXT: s_mov_b32 s4, 0x6dc9c882 ; SI-NEXT: s_mov_b32 s5, 0xbfc45f30 -; SI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; SI-NEXT: v_max_f64 v[0:1], -v[0:1], s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_inv2pi_minnum_f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; VI-NEXT: v_min_f64 v[0:1], v[0:1], 0.15915494309189532 ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1113,17 +1102,15 @@ define double @v_fneg_neg_inv2pi_minnum_f64(double %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] ; SI-NEXT: s_mov_b32 s4, 0x6dc9c882 ; SI-NEXT: s_mov_b32 s5, 0x3fc45f30 -; SI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; SI-NEXT: v_max_f64 v[0:1], -v[0:1], s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
VI-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] -; VI-NEXT: v_max_f64 v[0:1], v[0:1], 0.15915494309189532 +; VI-NEXT: v_max_f64 v[0:1], -v[0:1], 0.15915494309189532 ; VI-NEXT: s_setpc_b64 s[30:31] %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a) %fneg = fneg double %min @@ -1145,7 +1132,6 @@ define float @v_fneg_0_minnum_foldable_use_f32_ieee(float %a, float %b) #0 { ; GCN-LABEL: v_fneg_0_minnum_foldable_use_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, 0, v0 ; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1159,15 +1145,14 @@ define float @v_fneg_inv2pi_minnum_foldable_use_f32(float %a, float %b) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 0xbe22f983, v0 +; SI-NEXT: s_mov_b32 s4, 0xbe22f983 +; SI-NEXT: v_max_f32_e64 v0, -v0, s4 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; VI-NEXT: v_min_f32_e32 v0, 0.15915494, v0 ; VI-NEXT: v_mul_f32_e64 v0, -v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1194,9 +1179,7 @@ define { float, float } @v_fneg_minnum_multi_use_minnum_f32_ieee(float %a, float ; GCN-LABEL: v_fneg_minnum_multi_use_minnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_max_f32_e64 v0, -v0, -v1 ; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float %a, float %b) @@ -1230,9 +1213,7 @@ define float @v_fneg_maxnum_f32_ieee(float %a, float %b) #0 { ; GCN-LABEL: 
v_fneg_maxnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_min_f32_e64 v0, -v0, -v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float %a, float %b) %fneg = fneg float %max @@ -1276,8 +1257,7 @@ define float @v_fneg_posk_maxnum_f32_ieee(float %a) #0 { ; GCN-LABEL: v_fneg_posk_maxnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, -4.0, v0 +; GCN-NEXT: v_min_f32_e64 v0, -v0, -4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float 4.0, float %a) %fneg = fneg float %max @@ -1299,8 +1279,7 @@ define float @v_fneg_negk_maxnum_f32_ieee(float %a) #0 { ; GCN-LABEL: v_fneg_negk_maxnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_min_f32_e64 v0, -v0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float -4.0, float %a) %fneg = fneg float %max @@ -1334,8 +1313,7 @@ define float @v_fneg_neg0_maxnum_f32_ieee(float %a) #0 { ; GCN-LABEL: v_fneg_neg0_maxnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 0, v0 +; GCN-NEXT: v_min_f32_e64 v0, -v0, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float -0.0, float %a) %fneg = fneg float %max @@ -1357,7 +1335,6 @@ define float @v_fneg_0_maxnum_foldable_use_f32_ieee(float %a, float %b) #0 { ; GCN-LABEL: v_fneg_0_maxnum_foldable_use_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v0, 0, v0 ; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ 
-1384,9 +1361,7 @@ define { float, float } @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float %a, float ; GCN-LABEL: v_fneg_maxnum_multi_use_maxnum_f32_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_min_f32_e64 v0, -v0, -v1 ; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float %a, float %b) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 21762ff4222a9..929b5e6d49c13 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -32,27 +32,25 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: 
s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -78,25 +76,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -104,13 +100,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -128,13 +122,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -194,27 +186,25 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -240,25 +230,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -266,13 +254,11 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -292,17 +278,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 @@ -358,27 +342,25 @@ define float 
@global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -404,25 +386,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -430,13 +410,11 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -456,17 +434,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: 
flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 @@ -522,22 +498,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; 
GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -567,20 +541,18 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -590,20 +562,18 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], 
v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -613,20 +583,18 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -678,22 +646,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off 
offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -723,20 +689,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 
vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -746,20 +710,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -771,20 +733,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; 
GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -837,22 +797,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] 
; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -882,20 +840,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -905,20 +861,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: 
v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -930,20 +884,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -997,27 +949,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: 
global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1043,28 +993,26 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; 
%atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1072,13 +1020,11 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1098,17 +1044,15 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 
; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 @@ -1165,22 +1109,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX942-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1210,23 +1152,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1236,20 +1176,18 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1261,20 +1199,18 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1327,42 
+1263,38 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 
v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1382,14 +1314,12 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1407,25 +1337,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1433,13 +1361,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1457,13 +1383,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1485,13 +1409,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc @@ -1515,14 +1437,12 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc @@ -1558,27 +1478,25 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 
+; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1604,25 +1522,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, 
exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1630,13 +1546,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1654,13 +1568,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1724,27 +1636,25 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; 
GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1770,25 +1680,23 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1796,13 +1704,11 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1820,13 +1726,11 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1886,27 +1790,25 @@ define 
float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1932,25 +1834,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1958,13 +1858,11 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1984,17 +1882,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, 
vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 @@ -2050,27 +1946,25 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 
v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2096,25 +1990,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2122,13 +2014,11 @@ 
define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2148,17 +2038,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -2214,22 +2102,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: 
global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2259,20 +2145,18 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2282,20 +2166,18 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2305,20 +2187,18 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; 
GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2370,22 +2250,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2415,20 +2293,18 @@ define void 
@global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2438,20 +2314,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], 
v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2463,20 +2337,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2529,22 +2401,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: 
global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2574,20 +2444,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2597,20 +2465,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2622,20 +2488,18 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; 
GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2689,27 +2553,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2735,28 +2597,26 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2764,13 +2624,11 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2790,17 +2648,15 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -2857,22 +2713,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX942-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; 
%bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2902,23 +2756,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], 
v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2928,20 +2780,18 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2953,20 +2803,18 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3015,15 +2863,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3053,15 +2899,13 @@ define double 
@global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3099,15 +2943,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3125,15 +2967,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: 
.LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3188,15 +3028,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3226,15 +3064,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3272,15 +3108,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3300,15 +3134,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3362,15 +3194,13 @@ 
define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3400,15 +3230,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3446,15 +3274,13 @@ define double 
@global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3474,15 +3300,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[8:9], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3535,21 +3359,18 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; 
%atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3572,22 +3393,19 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; 
GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3617,21 +3435,19 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3641,21 +3457,19 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3698,21 +3512,18 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: 
global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3735,22 +3546,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 
@@ -3780,21 +3588,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3806,21 +3612,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3864,21 +3668,18 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; 
GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3901,22 +3702,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3946,21 +3744,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3972,21 +3768,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4031,15 +3825,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4069,15 +3861,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: 
v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4097,15 +3887,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4124,41 +3912,37 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4176,15 +3960,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], 
v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4202,30 +3984,30 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v[4:5], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[10:11], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[10:11] -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_max_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 ; GFX7-NEXT: s_andn2_b64 exec, exec, 
s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4236,30 +4018,31 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v[4:5], s[4:7], 0 addr64 -; GFX6-NEXT: v_max_f64 v[10:11], v[2:3], v[2:3] +; GFX6-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[10:11] -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: 
s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4279,15 +4062,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4317,15 +4098,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4363,15 +4142,13 @@ define double 
@global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4389,15 +4166,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4456,9 +4231,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4473,11 +4247,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4502,9 +4274,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4518,12 +4289,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: 
v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4552,12 +4322,10 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -4580,9 +4348,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4597,11 +4364,9 @@ define half 
@global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -4622,9 +4387,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4638,12 +4402,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -4664,7 +4427,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4677,7 +4439,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -4707,12 +4468,10 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -4734,33 +4493,31 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 ; 
GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4775,13 +4532,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4892,35 +4647,30 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4931,7 +4681,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4944,15 +4694,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: 
v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4960,12 +4708,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4989,69 +4736,62 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; 
GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, 
v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5062,19 +4802,18 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5086,12 +4825,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -5113,10 +4851,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5126,7 +4863,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -5147,36 +4883,34 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: 
global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5185,33 +4919,31 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5227,13 +4959,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5348,35 +5078,30 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: 
v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5387,7 +5112,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, 
s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5400,15 +5125,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5416,12 +5139,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5446,69 +5168,62 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5519,19 +5234,18 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5543,12 +5257,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -5570,10 +5283,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5583,7 +5295,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -5604,36 +5315,34 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; 
GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5642,33 +5351,31 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: 
v_max_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5684,13 +5391,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5806,9 +5511,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: 
global_load_b32 v4, v[0:1], off @@ -5822,10 +5526,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5850,9 +5553,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5865,12 +5567,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5899,12 +5599,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -5926,9 +5624,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5942,10 +5639,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -5966,9 +5662,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5981,12 +5676,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -6007,7 +5700,6 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6019,7 +5711,6 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -6049,12 +5740,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -6082,12 +5771,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 ; 
GFX908-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -6115,12 +5802,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -6230,37 +5915,33 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6277,37 +5958,33 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6324,30 +6001,28 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, 
v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6357,38 +6032,34 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6400,37 +6071,34 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; 
GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6442,31 +6110,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -6477,31 +6143,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; 
GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6511,31 +6175,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; 
GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6545,32 +6207,30 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: 
v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6673,37 +6333,33 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, 
v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6720,37 +6376,33 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6768,30 +6420,28 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 
v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB31_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6801,38 +6451,34 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; 
GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6844,37 +6490,34 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6886,31 +6529,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, 
vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -6921,31 +6562,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: 
global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -6955,31 +6594,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6989,32 +6626,30 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7118,7 +6753,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7126,9 +6760,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7153,17 +6785,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7183,36 +6813,33 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_max_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; 
GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7220,9 +6847,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7243,17 +6868,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7274,14 +6897,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v4, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7301,27 +6922,25 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX90A-NEXT: 
v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7329,14 +6948,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v4, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -7357,19 +6974,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -7458,15 +7073,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7490,24 +7103,21 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -7520,24 +7130,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; 
GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_max_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7548,15 +7156,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7576,25 +7182,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7606,23 +7209,21 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -7633,22 +7234,20 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off 
offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7658,22 +7257,20 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v3, v4, v2 +; 
GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7685,22 +7282,20 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v3, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7785,36 +7380,31 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 
v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7825,7 +7415,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7838,15 +7428,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7854,12 +7442,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -7884,69 +7471,62 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; 
GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7957,19 +7537,18 @@ 
define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7981,12 +7560,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -8008,10 +7586,9 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8021,7 +7598,6 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -8042,39 +7618,37 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, 
v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8083,33 +7657,31 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off +; 
GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8125,13 +7697,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: 
; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8246,38 +7816,34 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8294,38 +7860,34 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: 
v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], 
off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8342,30 +7904,28 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], 
v[2:3], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB35_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8375,38 +7935,34 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 
v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8418,37 +7974,34 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 
0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8460,31 +8013,29 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8495,34 +8046,32 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8532,31 +8081,29 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: 
v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8566,32 +8113,30 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, 
v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9124,7 +8669,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 @@ -9168,7 +8712,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: 
v_and_b32_e32 v3, v4, v6 @@ -9641,7 +9184,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 @@ -9686,7 +9228,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10162,7 +9703,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 @@ -10207,7 +9747,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10655,7 +10194,6 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 @@ -10698,7 +10236,6 @@ define void 
@global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_max_f32_e32 v3, v3, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11158,7 +10695,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -11202,7 +10738,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11664,7 +11199,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -11708,7 +11242,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -12088,7 +11621,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -12125,7 +11657,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -12490,7 +12021,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -12525,7 +12055,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -13003,7 +12532,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 @@ -13048,7 +12576,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; 
GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -13515,7 +13042,6 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -13559,7 +13085,6 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -13598,15 +13123,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -13625,43 +13148,38 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13681,14 +13199,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13706,25 +13222,23 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13732,13 +13246,11 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13756,21 +13268,17 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -13880,15 +13388,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -13907,43 +13413,38 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; 
GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB47_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13963,14 +13464,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: 
v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13988,25 +13487,23 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14014,13 +13511,11 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14040,21 +13535,17 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], 
v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -14164,15 +13655,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -14191,43 +13680,38 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt 
vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -14247,14 +13731,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -14272,25 +13754,23 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14298,13 +13778,11 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14324,21 +13802,17 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -14455,21 +13929,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -14482,23 +13953,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: 
v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB49_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14508,22 +13976,19 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14535,21 +14000,19 @@ define void 
@global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -14560,20 +14023,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], 
v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14583,20 +14044,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14606,24 +14065,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14724,21 +14179,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 
; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -14751,23 +14203,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end 
@@ -14777,22 +14226,19 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14804,21 +14250,19 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: 
v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -14829,20 +14273,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14852,20 +14294,18 @@ define void 
@global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14877,24 +14317,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; 
GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14996,21 +14432,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15023,23 +14456,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15049,22 +14479,19 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: 
global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15076,21 +14503,19 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15101,20 +14526,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15124,20 +14547,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15149,24 +14570,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: 
v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15277,15 +14694,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -15305,43 +14720,38 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: 
s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -15361,14 +14771,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -15386,28 +14794,26 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 
v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15415,13 +14821,11 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15441,21 +14845,17 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 
v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15564,22 +14964,19 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15592,23 +14989,20 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX942-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15618,22 +15012,19 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off 
offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15645,21 +15036,19 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -15670,23 +15059,21 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; 
%bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15696,20 +15083,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v3, v4, v2 +; GFX908-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15721,24 +15106,20 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index ea493405612d1..22155e2424ba6 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -32,27 +32,25 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -78,25 +76,23 @@ define float 
@global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -104,13 +100,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; 
GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -128,13 +122,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -194,27 +186,25 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, 
v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -240,25 +230,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -266,13 +254,11 @@ define float 
@global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -292,17 +278,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 @@ -358,27 +342,25 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: 
global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -404,25 +386,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off 
offset:-2048 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -430,13 +410,11 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -456,17 +434,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: 
v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 @@ -522,22 +498,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -567,20 +541,18 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -590,20 +562,18 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -613,20 +583,18 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -678,22 +646,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; 
GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -723,20 +689,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -746,20 +710,18 
@@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -771,20 +733,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap 
v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -837,22 +797,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -882,20 +840,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -905,20 +861,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -930,20 +884,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -997,27 +949,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; 
GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1043,28 +993,26 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, 
v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1072,13 +1020,11 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1098,17 +1044,15 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 
+; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 @@ -1165,22 +1109,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX942-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1210,23 +1152,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1236,20 +1176,18 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, 
v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1261,20 +1199,18 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1327,42 +1263,38 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, 
v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1382,14 +1314,12 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1407,25 +1337,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1433,13 
+1361,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1457,13 +1383,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1485,13 +1409,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: buffer_atomic_cmpswap 
v[3:4], v[0:1], s[4:7], 0 addr64 glc @@ -1515,14 +1437,12 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc @@ -1558,27 +1478,25 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: 
s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1604,25 +1522,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1630,13 +1546,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1654,13 +1568,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1724,27 +1636,25 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, 
v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1770,25 +1680,23 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1796,13 +1704,11 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1820,13 +1726,11 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1886,27 +1790,25 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 
0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1932,25 +1834,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1958,13 +1858,11 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1984,17 +1882,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] 
glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 @@ -2050,27 +1946,25 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2096,25 +1990,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2122,13 +2014,11 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2148,17 +2038,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -2214,22 +2102,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: 
v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2259,20 +2145,18 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2282,20 +2166,18 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: 
global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2305,20 +2187,18 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2370,22 +2250,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2415,20 +2293,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, 
v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2438,20 +2314,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2463,20 +2337,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2529,22 +2401,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; 
GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2574,20 +2444,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ 
-2597,20 +2465,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2622,20 +2488,18 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, 
v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2689,27 +2553,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2735,28 +2597,26 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2764,13 +2624,11 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: 
v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2790,17 +2648,15 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -2857,22 +2713,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX942-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2902,23 +2756,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 
v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2928,20 +2780,18 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2953,20 +2803,18 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; 
GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3015,15 +2863,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3053,15 +2899,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3099,15 +2943,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3125,15 +2967,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3188,15 +3028,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3226,15 +3064,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3272,15 
+3108,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3300,15 +3134,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3362,15 +3194,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3400,15 +3230,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3446,15 +3274,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3474,15 +3300,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[8:9], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3535,21 +3359,18 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3572,22 +3393,19 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3617,21 +3435,19 @@ 
define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3641,21 +3457,19 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; 
GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3698,21 +3512,18 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 
depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3735,22 +3546,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3780,21 +3588,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 
v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3806,21 +3612,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 
vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3864,21 +3668,18 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -3901,22 +3702,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3946,21 +3744,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 
v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3972,21 +3768,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4031,15 +3825,13 @@ define double 
@global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4069,15 +3861,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4097,15 +3887,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4124,41 +3912,37 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: 
s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4176,15 +3960,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4202,30 +3984,30 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v[4:5], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[10:11], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[10:11] -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_min_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4236,30 +4018,31 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v[4:5], s[4:7], 0 addr64 -; GFX6-NEXT: v_max_f64 v[10:11], v[2:3], v[2:3] +; GFX6-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX6-NEXT: v_min_f64 v[6:7], v[0:1], v[10:11] -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_min_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4279,15 +4062,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4317,15 +4098,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4363,15 +4142,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; 
GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4389,15 +4166,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[6:7], v[2:3] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4456,9 +4231,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4473,11 +4247,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4502,9 +4274,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4518,12 +4289,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4552,12 +4322,10 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -4580,9 +4348,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4597,11 +4364,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 
v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -4622,9 +4387,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4638,12 +4402,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -4664,7 +4427,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4677,7 +4439,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -4707,12 +4468,10 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -4734,33 +4493,31 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: 
.LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4775,13 +4532,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4892,35 +4647,30 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.l ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4931,7 +4681,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4944,15 +4694,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4960,12 +4708,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4989,69 +4736,62 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; 
GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5062,19 +4802,18 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5086,12 +4825,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -5113,10 +4851,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5126,7 +4863,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -5147,36 +4883,34 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5185,33 +4919,31 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: 
global_load_dword v5, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5227,13 +4959,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: 
v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5348,35 +5078,30 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5387,7 +5112,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5400,15 +5125,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 
-1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5416,12 +5139,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5446,69 +5168,62 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; 
GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5519,19 +5234,18 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5543,12 +5257,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -5570,10 +5283,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5583,7 +5295,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -5604,36 +5315,34 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 
; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 
v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5642,33 +5351,31 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5684,13 +5391,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5806,9 +5511,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5822,10 +5526,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_min_num_f16_e32 v3.l, v3.l, v2.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5850,9 +5553,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5865,12 +5567,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5899,12 +5599,10 @@ define void 
@global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -5926,9 +5624,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5942,10 +5639,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -5966,9 +5662,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5981,12 +5676,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -6007,7 +5700,6 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6019,7 +5711,6 @@ define void 
@global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -6049,12 +5740,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 @@ -6082,12 +5771,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -6115,12 +5802,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -6230,37 +5915,33 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6277,37 +5958,33 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv 
scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6324,30 +6001,28 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 
v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6357,38 +6032,34 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6400,37 +6071,34 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6442,31 +6110,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -6477,31 +6143,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 
v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6511,31 +6175,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; 
GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6545,32 +6207,30 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; 
GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6673,37 +6333,33 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; 
GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6720,37 +6376,33 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -6768,30 +6420,28 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB31_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6801,38 +6451,34 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6844,37 +6490,34 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; 
GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6886,31 +6529,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: 
v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -6921,31 +6562,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: 
v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6955,31 +6594,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, 
vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6989,32 +6626,30 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, 
vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7118,7 +6753,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off 
offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7126,9 +6760,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7153,17 +6785,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 
; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7183,36 +6813,33 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_min_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, 
v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7220,9 +6847,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7243,17 +6868,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v4, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7274,14 +6897,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_min_f16_e32 v3, v4, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7301,27 +6922,25 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: v_min_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7329,14 +6948,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v4, v2 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -7357,19 +6974,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 
0xffff0000, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -7458,15 +7073,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7490,24 +7103,21 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -7520,24 +7130,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 
-; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_min_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7548,15 +7156,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7576,25 +7182,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 
; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7606,23 +7209,21 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: 
v_max_f16_e32 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_min_f16_e32 v3, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -7633,22 +7234,20 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; 
GFX90A-NEXT: v_min_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7658,22 +7257,20 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: v_min_f16_e32 v3, v4, v2 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7685,22 
+7282,20 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_e32 v3, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7785,36 +7380,31 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], 
off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], 
v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7825,7 +7415,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7838,15 +7428,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7854,12 +7442,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -7884,69 +7471,62 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; 
GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7957,19 +7537,18 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7981,12 +7560,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -8008,10 +7586,9 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; 
GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8021,7 +7598,6 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -8042,39 +7618,37 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; 
GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8083,33 +7657,31 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8125,13 +7697,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 ; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8246,38 +7816,34 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 
v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8294,38 +7860,34 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, 
vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -8342,30 +7904,28 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB35_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8375,38 +7935,34 @@ 
define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8418,37 +7974,34 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], 
off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8460,31 +8013,29 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8495,34 +8046,32 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8532,31 +8081,29 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: 
global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8566,32 +8113,30 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9124,7 +8669,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 @@ -9168,7 +8712,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 @@ -9641,7 +9184,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 @@ -9686,7 +9228,6 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10162,7 +9703,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 @@ -10207,7 +9747,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10655,7 +10194,6 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 @@ -10698,7 +10236,6 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_min_f32_e32 v3, v3, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11158,7 +10695,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -11202,7 +10738,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -11664,7 +11199,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -11708,7 +11242,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -12088,7 +11621,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -12125,7 +11657,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; 
GFX6-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -12490,7 +12021,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -12525,7 +12055,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 @@ -13003,7 +12532,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 @@ -13048,7 +12576,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -13515,7 +13042,6 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; 
GFX7-NEXT: v_and_b32_e32 v7, v3, v5 @@ -13559,7 +13085,6 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -13598,15 +13123,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -13625,43 +13148,38 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, 
v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13681,14 +13199,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13706,25 +13222,23 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13732,13 +13246,11 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13756,21 +13268,17 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -13880,15 +13388,13 @@ define <2 x half> 
@global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -13907,43 +13413,38 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB47_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13963,14 +13464,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13988,25 
+13487,23 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14014,13 +13511,11 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; 
GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14040,21 +13535,17 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -14164,15 +13655,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 
; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -14191,43 +13680,38 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -14247,14 +13731,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -14272,25 +13754,23 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14298,13 +13778,11 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: 
s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14324,21 +13802,17 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -14455,21 +13929,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -14482,23 +13953,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: 
s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB49_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14508,22 +13976,19 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14535,21 +14000,19 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; 
GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -14560,20 +14023,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14583,20 +14044,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: 
global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14606,24 +14065,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
-; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14724,21 +14179,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -14751,23 +14203,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14777,22 +14226,19 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14804,21 +14250,19 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, 
vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -14829,20 +14273,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14852,20 +14294,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; 
GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14877,24 +14317,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14996,21 +14432,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15023,23 +14456,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off 
offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15049,22 +14479,19 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off 
offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15076,21 +14503,19 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15101,20 +14526,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15124,20 +14547,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15149,24 +14570,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15277,15 +14694,13 @@ define <2 x half> 
@global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -15305,43 +14720,38 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 
vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -15361,14 +14771,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt 
vmcnt(0) @@ -15386,28 +14794,26 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15415,13 +14821,11 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15441,21 +14845,17 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_min_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15564,22 +14964,19 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -15592,23 +14989,20 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX942-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; 
GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15618,22 +15012,19 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15645,21 +15036,19 @@ define void 
@global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -15670,23 +15059,21 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 ; GFX90A-NEXT: 
buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15696,20 +15083,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_min_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15721,24 +15106,20 @@ define void 
@global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 2160976599dd7..69208049f3ae0 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -34,8 +34,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; 
GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -66,8 +65,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -159,8 +157,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -191,8 +188,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -305,14 +301,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; 
GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v2, s4, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -328,12 +322,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 @@ -381,12 +373,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX9-NEXT: v_max_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
@@ -399,14 +389,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -451,12 +439,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1064-NEXT: v_max_f32_e32 v1, s3, v1 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -507,12 +493,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-NEXT: v_max_f32_e32 v1, s2, v1 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -551,16 +535,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1164-NEXT: v_max_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -601,15 +582,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-NEXT: v_max_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; 
GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -658,22 +637,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, v2, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -716,32 +693,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; 
GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -752,14 +722,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, s4, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -804,31 +772,23 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 -; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v0, v3 @@ -877,22 +837,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -931,35 +885,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: 
v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; 
GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -1001,31 +945,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_dual_max_f32 v1, v2, v1 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 
v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1066,8 +1003,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -1098,8 +1034,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1191,8 +1126,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -1223,8 +1157,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1338,14 +1271,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v2, s4, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -1361,12 +1292,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 @@ -1414,12 +1343,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX9-NEXT: v_max_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1432,14 +1359,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ 
-1484,12 +1409,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1064-NEXT: v_max_f32_e32 v1, s3, v1 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1540,12 +1463,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-NEXT: v_max_f32_e32 v1, s2, v1 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1584,16 +1505,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 +; 
GFX1164-NEXT: v_max_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1634,15 +1552,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-NEXT: v_max_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1691,22 +1607,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: 
v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, v2, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1749,32 +1663,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: 
v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1785,14 +1692,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, s4, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1837,31 +1742,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; 
GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 -; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -1910,22 +1807,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; 
GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1964,35 +1855,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: 
v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -2034,31 +1915,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; 
GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_dual_max_f32 v1, v2, v1 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2100,8 +1974,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -2132,8 +2005,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2225,8 +2097,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -2257,8 +2128,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; 
GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2371,14 +2241,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v2, s4, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -2394,12 +2262,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 @@ -2447,12 +2313,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX9-NEXT: v_max_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2465,14 +2329,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2517,12 +2379,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1064-NEXT: v_max_f32_e32 v1, s3, v1 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2573,12 +2433,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-NEXT: v_max_f32_e32 v1, s2, v1 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2617,16 +2475,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1164-NEXT: v_max_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2667,15 +2522,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-NEXT: v_max_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2724,22 +2577,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, v2, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: 
s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2782,32 +2633,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -2818,14 +2662,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, s4, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2870,31 +2712,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 -; 
GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 -; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -2943,22 +2777,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: 
s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2997,35 +2825,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -3067,31 +2885,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_dual_max_f32 v1, v2, v1 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -3133,8 +2944,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], 
v[0:1], 4.0 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 @@ -3169,8 +2979,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3234,9 +3043,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3266,9 +3074,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: 
s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -3299,8 +3106,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -3335,8 +3141,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3400,9 +3205,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3432,9 +3236,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: 
v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -3486,15 +3289,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -3510,12 +3311,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: 
s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 @@ -3569,11 +3368,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3585,15 +3382,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3640,13 +3435,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: 
v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 -; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3698,13 +3491,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], s[2:3] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3744,17 +3535,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; 
GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3767,16 +3555,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3813,16 +3598,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f64 
v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3835,16 +3618,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -3888,25 +3667,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3950,41 +3727,34 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; 
GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -4005,10 +3775,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], s[0:1] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] @@ -4056,38 +3824,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 
v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], 
v[3:4] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -4140,30 +3900,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; 
GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 @@ -4207,45 +3961,35 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f64 
v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) @@ -4261,17 +4005,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; 
GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[0:1] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -4310,58 +4050,46 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; 
GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -4397,8 +4125,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 @@ -4433,8 +4160,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX9-NEXT: 
global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4498,9 +4224,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4530,9 +4255,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -4563,8 +4287,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -4599,8 +4322,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4664,9 +4386,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4696,9 +4417,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; 
GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -4750,15 +4470,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4774,12 +4492,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 @@ -4833,11 +4549,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], 
v[4:5], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4849,15 +4563,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4904,13 +4616,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 -; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4962,13 +4672,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a 
; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], s[2:3] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5008,17 +4716,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5031,16 +4736,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; 
GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5077,16 +4779,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5099,16 +4799,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 
v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -5152,25 +4848,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, 
v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5214,41 +4908,34 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; 
GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5269,10 +4956,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], s[0:1] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] @@ -5320,38 +5005,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp 
v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5404,30 +5081,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: 
v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 @@ -5471,45 +5142,35 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) @@ -5525,17 +5186,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[0:1] ; GFX1164-DPP-NEXT: 
global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -5574,58 +5231,46 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -5661,8 +5306,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 @@ -5697,8 +5341,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5762,9 +5405,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_max_f64 
v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5794,9 +5436,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -5827,8 +5468,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -5863,8 +5503,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5928,9 +5567,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5960,9 +5598,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6014,15 +5651,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS-NEXT: v_max_f64 
v[4:5], s[2:3], s[2:3] ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -6038,12 +5673,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 @@ -6097,11 +5730,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6113,15 +5744,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; 
GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6168,13 +5797,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 -; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6226,13 +5853,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], s[2:3] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; 
GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6272,17 +5897,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6295,16 +5917,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; 
GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6341,16 +5960,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6363,16 +5980,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: 
s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6416,25 +6029,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6478,41 +6089,34 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: 
v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -6533,10 +6137,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] ; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], s[0:1] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] @@ -6584,38 +6186,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6668,30 +6262,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 @@ -6735,45 +6323,35 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] -; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) @@ -6789,17 +6367,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[0:1] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -6838,58 +6412,46 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: 
v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; 
GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -6924,8 +6486,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -6956,8 +6517,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -7049,8 +6609,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -7081,8 +6640,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -7178,8 +6736,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -7210,8 +6767,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -7303,8 +6859,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -7335,8 +6890,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 029fb9c118344..7d2a52ad31a6a 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -34,8 +34,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -66,8 +65,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -159,8 +157,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -191,8 +188,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; 
GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -305,14 +301,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v2, s4, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -328,12 +322,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 @@ -381,12 +373,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; 
GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX9-NEXT: v_min_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -399,14 +389,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -451,12 +439,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1064-NEXT: v_min_f32_e32 v1, s3, v1 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -507,12 +493,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-NEXT: v_min_f32_e32 v1, s2, v1 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -551,16 +535,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1164-NEXT: v_min_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -601,15 +582,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; 
GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-NEXT: v_min_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -658,22 +637,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, v2, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -716,32 +693,25 @@ define 
amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -752,14 +722,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, s4, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -804,31 +772,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 -; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: 
v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_min_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -877,22 +837,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -931,35 +885,25 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: 
v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -1001,31 +945,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_dual_min_f32 v1, v2, v1 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf 
bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1066,8 +1003,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: 
v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -1098,8 +1034,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1191,8 +1126,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -1223,8 +1157,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1338,14 +1271,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; 
GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v2, s4, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -1361,12 +1292,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 @@ -1414,12 +1343,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX9-NEXT: v_min_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1432,14 +1359,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1484,12 +1409,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1064-NEXT: v_min_f32_e32 v1, s3, v1 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1540,12 +1463,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-NEXT: v_min_f32_e32 v1, s2, v1 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1584,16 +1505,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1164-NEXT: v_min_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1634,15 +1552,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-NEXT: v_min_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1691,22 +1607,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: 
s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, v2, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1749,32 +1663,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; 
GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -1785,14 +1692,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, s4, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1837,31 +1742,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; 
GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 -; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_min_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -1910,22 +1807,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: 
v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1964,35 +1855,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 
0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -2034,31 +1915,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_dual_min_f32 v1, v2, v1 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; 
GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2100,8 +1974,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -2132,8 +2005,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2225,8 +2097,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -2257,8 +2128,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2371,14 +2241,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v2, s4, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -2394,12 +2262,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt 
vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 @@ -2447,12 +2313,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX9-NEXT: v_min_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2465,14 +2329,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_min_f32_e32 v0, v1, v2 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2517,12 +2379,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_max_f32_e64 v2, s3, 
s3 +; GFX1064-NEXT: v_min_f32_e32 v1, s3, v1 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2573,12 +2433,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-NEXT: v_min_f32_e32 v1, s2, v1 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2617,16 +2475,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 +; GFX1164-NEXT: v_min_f32_e32 v1, s3, v1 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2667,15 
+2522,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 -; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-NEXT: v_min_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2724,22 +2577,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, v2, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: 
buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2782,32 +2633,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v5 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX9-DPP-NEXT: v_readlane_b32 
s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -2818,14 +2662,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, s4, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2870,31 +2712,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 
v4, v5, v5 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 -; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_min_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 @@ -2943,22 +2777,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 
row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2997,35 +2825,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -3067,31 +2885,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf 
bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_dual_min_f32 v1, v2, v1 :: v_dual_mov_b32 v2, 0x7fc00000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(VALU_DEP_2) @@ -3133,8 +2944,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 @@ -3169,8 +2979,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3234,9 +3043,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3266,9 +3074,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -3299,8 +3106,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -3335,8 +3141,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3400,9 +3205,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3432,9 +3236,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -3486,15 +3289,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -3510,12 +3311,10 @@ define amdgpu_kernel 
void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 @@ -3569,11 +3368,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3585,15 +3382,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) 
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3640,13 +3435,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 -; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3698,13 +3491,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], s[2:3] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3744,17 +3535,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 
v[2:3], v[4:5], v[4:5] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3767,16 +3555,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3813,16 +3598,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], 
v[4:5] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3835,16 +3618,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -3888,25 +3667,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; 
%atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3950,41 +3727,34 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 
v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -4005,10 +3775,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: v_min_f64 
v[9:10], v[11:12], s[0:1] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] @@ -4056,38 +3824,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; 
GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] -; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -4140,30 +3900,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: 
v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 @@ -4207,45 +3961,35 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; 
GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) @@ -4261,17 +4005,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[8:9], v[0:1] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -4310,58 +4050,46 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[8:9], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -4397,8 +4125,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 @@ -4433,8 +4160,7 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4498,9 +4224,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4530,9 +4255,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -4563,8 +4287,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; 
GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -4599,8 +4322,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4664,9 +4386,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4696,9 +4417,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -4750,15 +4470,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4774,12 +4492,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; 
GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 @@ -4833,11 +4549,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4849,15 +4563,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4904,13 +4616,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 -; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; 
GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4962,13 +4672,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], s[2:3] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5008,17 +4716,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; 
%ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5031,16 +4736,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5077,16 +4779,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5099,16 +4799,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -5152,25 +4848,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5214,41 +4908,34 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: 
v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5269,10 +4956,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[11:12], s[0:1] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] @@ -5320,38 +5005,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], 
v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] -; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5404,30 
+5081,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 @@ -5471,45 +5142,35 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) @@ -5525,17 +5186,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[8:9], v[0:1] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -5574,58 +5231,46 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 
row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[8:9], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -5661,8 +5306,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 @@ -5697,8 +5341,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5762,9 +5405,8 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5794,9 +5436,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -5827,8 +5468,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -5863,8 +5503,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: 
v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5928,9 +5567,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5960,9 +5598,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6014,15 +5651,13 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -6038,12 +5673,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 @@ -6097,11 +5730,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, 
s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6113,15 +5744,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6168,13 +5797,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 -; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6226,13 +5853,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; 
GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], s[2:3] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6272,17 +5897,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6295,16 +5917,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: 
.LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6341,16 +5960,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] -; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: v_min_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6363,16 +5980,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB11_4: ; 
%atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6416,25 +6029,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6478,41 +6089,34 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_nop 0 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 
row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] ; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -6533,10 +6137,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] ; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] -; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[11:12], s[0:1] ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] @@ -6584,38 +6186,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], 
v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] -; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] -; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], s[2:3], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6668,30 +6262,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v4, 0, 0 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 -; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 @@ -6735,45 +6323,35 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) 
| instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) @@ -6789,17 +6367,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[8:9], v[0:1] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ 
-6838,58 +6412,46 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 0x7ff80000 :: 
v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v4, v2 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v6 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[8:9], v[0:1] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -6924,8 +6486,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -6956,8 +6517,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -7049,8 +6609,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -7081,8 +6640,7 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -7178,8 +6736,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 @@ -7210,8 +6767,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -7303,8 +6859,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 @@ -7335,8 +6890,7 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v1 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll index 078f3014fdc9b..b05d4c1794965 100644 --- a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -114,7 +114,6 @@ define float @v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32(float ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -130,7 +129,6 @@ define float @v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32(float ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -146,7 +144,6 @@ define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -162,7 +159,6 @@ define float @v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; 
GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -195,7 +191,6 @@ define float @v_maxnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -211,7 +206,6 @@ define float @v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -248,8 +242,8 @@ define float @v_select_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %b.nnan.add = fadd nnan float %b, 1.0 %cmp = icmp eq i32 %c, 0 @@ -266,8 +260,8 @@ define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %cmp = icmp eq i32 %c, 0 @@ -527,7 +521,6 @@ define float @v_test_known_not_snan_fmed3_input_fmed3_r_i_i_f32(float %a, float ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: 
v_med3_f32 v0, v0, v1, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 2da2aa182971c..2ded050c8f1bc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -57,8 +57,6 @@ define amdgpu_kernel void @maxnum_f16( ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -82,8 +80,6 @@ define amdgpu_kernel void @maxnum_f16( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -108,8 +104,6 @@ define amdgpu_kernel void @maxnum_f16( ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -134,9 +128,6 @@ define amdgpu_kernel void @maxnum_f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_endpgm @@ -161,9 +152,6 @@ define amdgpu_kernel void 
@maxnum_f16( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-FAKE16-NEXT: s_endpgm @@ -213,7 +201,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -232,7 +219,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -251,7 +237,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -270,8 +255,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, 0x4200, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm @@ -290,8 +273,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, 0x4200, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm @@ -339,7 +320,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -358,7 +338,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -377,7 +356,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -396,8 +374,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, 4.0, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm @@ -416,8 +392,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, 4.0, v0 ; 
GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm @@ -468,13 +442,12 @@ define amdgpu_kernel void @maxnum_v2f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_max_f16_e32 v0, s2, v0 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -492,44 +465,38 @@ define amdgpu_kernel void @maxnum_v2f16( ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s11, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, s4, s5 ; GFX10-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v0, s2, s4 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -567,33 +534,36 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; VI-LABEL: maxnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x4200 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s4, s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s4, s4 -; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_max_f16_e32 v0, s2, v0 ; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry ; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x44004200 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x44004200 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16_imm_a: @@ -603,9 +573,8 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, s2 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -616,10 +585,8 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -655,17 +622,16 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; VI-LABEL: maxnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4200 +; VI-NEXT: v_mov_b32_e32 v1, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: 
s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: v_max_f16_e64 v0, s4, 4.0 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 -; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -673,15 +639,16 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; GFX9-LABEL: maxnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x42004400 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x42004400 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16_imm_b: @@ -691,9 +658,8 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, s2 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -704,10 +670,8 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; 
GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -763,18 +727,16 @@ define amdgpu_kernel void @maxnum_v3f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_max_f16_e32 v0, s2, v0 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 -; VI-NEXT: v_max_f16_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_max_f16_e32 v1, s3, v1 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -786,17 +748,16 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 -; 
GFX9-NEXT: v_pk_max_f16 v2, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_pk_max_f16 v1, s3, v1 +; GFX9-NEXT: v_pk_max_f16 v0, s2, v0 ; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -804,45 +765,36 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX10-LABEL: maxnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 -; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 -; GFX10-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: v_pk_max_f16 v0, s5, s9 +; GFX10-NEXT: v_pk_max_f16 v1, s4, s8 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 +; GFX11-NEXT: s_load_b64 s[6:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_max_f16 v1, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v0, v3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s7, s5 +; GFX11-NEXT: v_pk_max_f16 v1, s6, s4 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -907,22 +859,20 @@ define amdgpu_kernel void @maxnum_v4f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 ; VI-NEXT: s_lshr_b32 s0, s9, 16 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_max_f16_e32 v0, s3, v0 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_max_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 
v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_max_f16_e32 v0, s2, v0 ; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -935,58 +885,48 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 -; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_pk_max_f16 v1, s3, v0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s4, s4 -; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX10-NEXT: 
v_pk_max_f16 v0, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v1, s5, s9 +; GFX10-NEXT: v_pk_max_f16 v0, s4, s8 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 +; GFX11-NEXT: s_load_b64 s[6:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v0, v3, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v1, s7, s5 +; GFX11-NEXT: v_pk_max_f16 v0, s6, s4 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1033,24 +973,24 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; VI-LABEL: fmax_v4f16_imm_a: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x4400 +; VI-NEXT: v_mov_b32_e32 v0, 0x4200 +; VI-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-NEXT: v_mov_b32_e32 v2, 0x4800 ; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 -; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1 -; VI-NEXT: 
v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_max_f16_e32 v0, s3, v0 +; VI-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_or_b32_e32 v1, v1, v0 -; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_max_f16_e32 v0, s2, v2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1060,8 +1000,8 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; GFX9-LABEL: fmax_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s8, 0x44004200 -; GFX9-NEXT: s_mov_b32 s9, 0x40004800 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x44004200 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x40004800 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1069,10 +1009,8 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 -; GFX9-NEXT: v_pk_max_f16 v1, v0, s8 -; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 +; GFX9-NEXT: v_pk_max_f16 v1, s3, v0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -1080,14 +1018,12 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; 
GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0 -; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, s5 +; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, s4 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -1095,15 +1031,12 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, v0 -; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, s5 +; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, s4 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 6b40024d3af01..d8891b6864c0f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -57,8 +57,6 @@ define amdgpu_kernel void @minnum_f16_ieee( ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -82,8 +80,6 @@ define amdgpu_kernel void @minnum_f16_ieee( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_max_f16_e32 v0, v0, 
v0 -; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -108,8 +104,6 @@ define amdgpu_kernel void @minnum_f16_ieee( ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -134,8 +128,6 @@ define amdgpu_kernel void @minnum_f16_ieee( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h ; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_endpgm @@ -160,8 +152,6 @@ define amdgpu_kernel void @minnum_f16_ieee( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-FAKE16-NEXT: s_endpgm @@ -248,7 +238,6 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -267,7 +256,6 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -286,7 +274,6 @@ define amdgpu_kernel void 
@minnum_f16_imm_a( ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -305,7 +292,6 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, 0x4200, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm @@ -324,7 +310,6 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, 0x4200, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm @@ -372,7 +357,6 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -391,7 +375,6 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -410,7 +393,6 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; 
GFX10-NEXT: s_endpgm @@ -429,7 +411,6 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; GFX11-TRUE16-NEXT: buffer_load_d16_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, 4.0, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm @@ -448,7 +429,6 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, 4.0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm @@ -499,13 +479,12 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_min_f16_e32 v0, s2, v0 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -523,43 +502,38 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_pk_min_f16 v0, s11, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; 
GFX10-LABEL: minnum_v2f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX10-NEXT: v_pk_min_f16 v0, s4, s5 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v2f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 +; GFX11-NEXT: v_pk_min_f16 v0, s2, s4 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -634,33 +608,36 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; VI-LABEL: minnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x4200 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s4, s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword 
s2, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s4, s4 -; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_min_f16_e32 v0, s2, v0 ; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x44004200 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x44004200 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_min_f16 v0, s2, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_imm_a: @@ -670,9 +647,8 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, s2 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -683,9 +659,8 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 
0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -721,17 +696,16 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; VI-LABEL: minnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4200 +; VI-NEXT: v_mov_b32_e32 v1, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: v_min_f16_e64 v0, s4, 4.0 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: v_max_f16_e64 v1, s4, s4 -; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 -; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -739,15 +713,16 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; GFX9-LABEL: minnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x42004400 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x42004400 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: 
v_pk_min_f16 v0, s2, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_imm_b: @@ -757,9 +732,8 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, s2 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -770,9 +744,8 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -828,18 +801,16 @@ define amdgpu_kernel void @minnum_v3f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_min_f16_e32 v0, s2, v0 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 -; VI-NEXT: v_min_f16_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_min_f16_e32 v1, s3, v1 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 
; VI-NEXT: s_endpgm @@ -851,17 +822,16 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 -; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 -; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_pk_min_f16 v1, s3, v1 +; GFX9-NEXT: v_pk_min_f16 v0, s2, v0 ; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -869,44 +839,36 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX10-LABEL: minnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_min_f16 v1, v2, v1 -; GFX10-NEXT: v_pk_min_f16 v0, v3, v0 -; GFX10-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 
+; GFX10-NEXT: v_pk_min_f16 v0, s5, s9 +; GFX10-NEXT: v_pk_min_f16 v1, s4, s8 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 +; GFX11-NEXT: s_load_b64 s[6:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v1, v2, v1 -; GFX11-NEXT: v_pk_min_f16 v0, v3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, s7, s5 +; GFX11-NEXT: v_pk_min_f16 v1, s6, s4 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -971,22 +933,20 @@ define amdgpu_kernel void @minnum_v4f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 ; VI-NEXT: s_lshr_b32 s0, s9, 16 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_min_f16_e32 v0, s3, 
v0 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_min_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_min_f16_e32 v0, s2, v0 ; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -999,57 +959,48 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 -; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_pk_min_f16 v1, s3, v0 +; GFX9-NEXT: v_pk_min_f16 v0, s2, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s4, s4 -; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 +; GFX10-NEXT: v_pk_min_f16 v1, s5, s9 +; GFX10-NEXT: v_pk_min_f16 v0, s4, s8 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 +; GFX11-NEXT: s_load_b64 s[6:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX11-NEXT: v_pk_min_f16 v0, v3, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v1, s7, s5 +; GFX11-NEXT: v_pk_min_f16 v0, s6, s4 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1096,24 +1047,24 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; VI-LABEL: fmin_v4f16_imm_a: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x4400 +; VI-NEXT: v_mov_b32_e32 v0, 0x4200 +; VI-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-NEXT: 
v_mov_b32_e32 v2, 0x4800 ; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 -; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1 -; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_min_f16_e32 v0, s3, v0 +; VI-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_or_b32_e32 v1, v1, v0 -; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_min_f16_e32 v0, s2, v2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1123,8 +1074,8 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; GFX9-LABEL: fmin_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s8, 0x44004200 -; GFX9-NEXT: s_mov_b32 s9, 0x40004800 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x44004200 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x40004800 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,10 +1083,8 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 -; GFX9-NEXT: v_pk_min_f16 v1, v0, s8 -; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 +; GFX9-NEXT: v_pk_min_f16 v1, s3, v0 +; GFX9-NEXT: v_pk_min_f16 v0, s2, v2 
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -1143,14 +1092,12 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0 -; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, s5 +; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, s4 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -1158,14 +1105,12 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0 -; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, s5 +; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, s4 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 09bdbb28ba2a1..fff9186588c1e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -798,11 +798,9 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -829,29 +827,28 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 
v2, v0, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, 4.0, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -859,7 +856,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f16: @@ -877,7 +874,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -911,11 +907,9 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v3, v0, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -937,36 +931,35 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, 4.0, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_f16: @@ -984,7 +977,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v2, 4.0, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 @@ -1016,7 +1008,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX90A-NEXT: v_and_or_b32 
v4, v3, v2, v4 @@ -1036,31 +1027,30 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX908-NEXT: ds_read_b32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_and_b32_e32 v0, 24, v2 -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 -; GFX908-NEXT: v_not_b32_e32 v2, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 24, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v3, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX908-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX908-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_ret_f16: @@ -1080,7 +1070,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_max_f16_e32 v3, 4.0, v3 ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 @@ -1190,11 +1179,9 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1234,12 +1221,11 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: 
ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1272,7 +1258,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1308,11 +1293,9 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1347,12 +1330,11 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1375,30 +1357,29 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v1, v4 +; GFX10-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v2 
; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1417,7 +1398,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1438,31 +1418,30 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 +; GFX908-NEXT: ds_read_b32 v2, v1 ; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX908-NEXT: v_not_b32_e32 v2, v2 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v0, s4 +; GFX908-NEXT: v_not_b32_e32 v3, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX908-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX908-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1483,7 +1462,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_max_f16_e32 v3, 4.0, v3 ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 @@ -1593,11 +1571,9 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1634,12 +1610,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 
v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1671,7 +1646,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1703,11 +1677,9 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1739,12 +1711,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: 
v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1774,7 +1745,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX10-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 @@ -1806,7 +1776,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1836,7 +1805,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX908-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1867,7 +1835,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 @@ -1974,10 +1941,9 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2016,12 +1982,10 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2054,7 +2018,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -2088,10 +2051,9 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2125,12 +2087,10 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2161,7 +2121,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX10-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -2194,7 +2153,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -2225,7 +2183,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX908-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -2257,7 +2214,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_max_f16_e32 v4, 4.0, v4 ; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 @@ -2359,9 +2315,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v2.l ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2392,10 +2346,9 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, 4.0, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, 4.0, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2421,8 +2374,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v2 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -2447,9 +2399,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 
4.0, v2.l ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2475,10 +2425,9 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, 4.0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, 4.0, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2503,8 +2452,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX10-NEXT: v_max_f16_e32 v1, 4.0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2529,8 +2477,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_max_f16_e32 v1, 4.0, v2 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -2554,8 +2501,7 @@ define half 
@local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX908-NEXT: v_max_f16_e32 v1, 4.0, v2 ; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -2578,9 +2524,8 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v1, 4.0, v2 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_max_f16_e32 v1, 4.0, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2662,10 +2607,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v1.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2694,11 +2638,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; 
GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, 4.0, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, 4.0, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2724,8 +2666,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX942-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX942-NEXT: v_max_f16_e32 v2, 4.0, v1 ; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -2746,10 +2687,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2773,11 +2713,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, 4.0, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, 4.0, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2801,8 +2739,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX10-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX10-NEXT: v_max_f16_e32 v2, 4.0, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2827,8 +2764,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX90A-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, 4.0, v1 ; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -2850,8 +2786,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX908-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX908-NEXT: v_max_f16_e32 v2, 4.0, v1 ; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; 
GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -2873,9 +2808,8 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v2, 4.0, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_max_f16_e32 v2, 4.0, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3345,7 +3279,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_mov_b32_e32 v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 @@ -3379,7 +3312,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_mov_b32_e32 v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 @@ -3811,7 +3743,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_mov_b32_e32 v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 @@ -3846,7 +3777,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_mov_b32_e32 v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 ; GFX6-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 @@ -4255,7 +4185,6 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 @@ -4288,7 +4217,6 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 @@ -4706,7 +4634,6 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 @@ -4740,7 +4667,6 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 @@ -5089,7 +5015,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -5117,7 +5042,6 @@ define bfloat 
@local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, 4.0, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -5450,7 +5374,6 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -5477,7 +5400,6 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -5510,15 +5432,13 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5539,38 +5459,33 @@ define <2 x half> 
@local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5589,14 +5504,12 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5615,22 +5528,20 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2f16: @@ -5638,13 +5549,11 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: 
v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -5662,20 +5571,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 @@ -5773,15 +5678,13 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5802,38 +5705,33 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5852,14 +5750,12 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5878,22 +5774,20 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v1 +; GFX90A-NEXT: 
ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -5901,13 +5795,11 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -5925,20 +5817,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, 
v4 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_max_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 @@ -6037,14 +5925,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6065,13 +5950,10 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, 
v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6087,14 +5969,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6113,13 +5992,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6138,12 +6015,10 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v1 ; 
GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6160,12 +6035,10 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6181,23 +6054,19 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6286,14 +6155,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6314,13 +6180,10 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6336,14 +6199,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; 
GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6362,13 +6222,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6387,12 +6245,10 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6409,12 +6265,10 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 
offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6430,23 +6284,19 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 3bb98a2a690ed..833baf715dfb6 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -798,11 +798,9 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -829,29 +827,28 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -859,7 +856,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f16: @@ -877,7 +874,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) 
; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -911,11 +907,9 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -937,36 +931,35 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 ; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; 
GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_f16: @@ -984,7 +977,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 @@ -1016,7 +1008,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1036,31 +1027,30 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX908-NEXT: ds_read_b32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_and_b32_e32 v0, 24, v2 -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 -; GFX908-NEXT: v_not_b32_e32 v2, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 24, v3 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v3, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX908-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX908-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_ret_f16: @@ -1080,7 +1070,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 @@ -1190,11 +1179,9 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1234,12 +1221,11 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1272,7 +1258,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1308,11 +1293,9 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1347,12 +1330,11 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1375,30 +1357,29 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX10-NEXT: ds_read_b32 v3, v0 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX10-NEXT: v_not_b32_e32 v2, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff +; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v1, v4 +; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, 
v0, v4, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1417,7 +1398,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1438,31 +1418,30 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v0, 0xfffe, v0 ; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX908-NEXT: ds_read_b32 v3, v1 +; GFX908-NEXT: ds_read_b32 v2, v1 ; GFX908-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX908-NEXT: v_not_b32_e32 v2, v2 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v0, s4 +; GFX908-NEXT: v_not_b32_e32 v3, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX908-NEXT: 
ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX908-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX908-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1483,7 +1462,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 ; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 @@ -1593,11 +1571,9 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 
v4, v1, v4, v2 @@ -1634,12 +1610,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1671,7 +1646,6 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1703,11 +1677,9 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1739,12 +1711,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1774,7 +1745,6 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 @@ -1806,7 +1776,6 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1836,7 +1805,6 @@ define void 
@local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -1867,7 +1835,6 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 @@ -1974,10 +1941,9 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2016,12 +1982,10 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: 
v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2054,7 +2018,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -2088,10 +2051,9 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2125,12 +2087,10 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2161,7 +2121,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -2194,7 +2153,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -2225,7 +2183,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 @@ -2257,7 +2214,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 ; GFX8-NEXT: v_and_b32_e32 
v5, v3, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 @@ -2359,9 +2315,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v2.l ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2392,10 +2346,9 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, 4.0, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, 4.0, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2421,8 +2374,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v2 ; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: 
s_waitcnt lgkmcnt(0) @@ -2447,9 +2399,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v2.l ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2475,10 +2425,9 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2503,8 +2452,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX10-NEXT: v_min_f16_e32 v1, 4.0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2529,8 +2477,7 @@ define half 
@local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v2 ; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -2554,8 +2501,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX908-NEXT: v_min_f16_e32 v1, 4.0, v2 ; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -2578,9 +2524,8 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX8-NEXT: v_min_f16_e32 v1, 4.0, v2 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_min_f16_e32 v1, 4.0, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2662,10 +2607,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v1.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: 
v_min_num_f16_e32 v2.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2694,11 +2638,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v2 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2724,8 +2666,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v1 ; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -2746,10 +2687,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2773,11 +2713,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2801,8 +2739,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2827,8 +2764,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX90A-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, 4.0, v1 ; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 
offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -2850,8 +2786,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX908-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX908-NEXT: v_min_f16_e32 v2, 4.0, v1 ; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -2873,9 +2808,8 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_min_f16_e32 v2, 4.0, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_min_f16_e32 v2, 4.0, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3345,7 +3279,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_mov_b32_e32 v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 @@ -3379,7 +3312,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: v_mov_b32_e32 v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 @@ -3811,7 +3743,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: v_mov_b32_e32 v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 @@ -3846,7 +3777,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: v_mov_b32_e32 v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 @@ -4255,7 +4185,6 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 @@ -4288,7 +4217,6 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 @@ -4706,7 +4634,6 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 @@ -4740,7 +4667,6 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_min_f32_e32 
v4, 4.0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 @@ -5089,7 +5015,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -5117,7 +5042,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, 4.0, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -5450,7 +5374,6 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -5477,7 +5400,6 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -5510,15 +5432,13 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5539,38 +5459,33 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX942-NEXT: v_pk_min_f16 v2, v3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 
v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5589,14 +5504,12 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_min_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5615,22 +5528,20 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 +; GFX90A-NEXT: v_pk_min_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz 
.LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2f16: @@ -5638,13 +5549,11 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: v_pk_min_f16 v2, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -5662,20 +5571,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_min_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v4, v3, v1 
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 @@ -5773,15 +5678,13 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5802,38 +5705,33 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX942-NEXT: v_pk_min_f16 v2, v3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5852,14 +5750,12 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_min_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5878,22 +5774,20 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 +; GFX90A-NEXT: v_pk_min_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -5901,13 +5795,11 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: v_pk_min_f16 v2, v3, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -5925,20 +5817,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 
offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_min_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 @@ -6037,14 +5925,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6065,13 +5950,10 @@ define void 
@local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX942-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6087,14 +5969,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6113,13 +5992,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6138,12 +6015,10 @@ 
define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6160,12 +6035,10 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6181,23 +6054,19 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; 
GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6286,14 +6155,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6314,13 +6180,10 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX942-NEXT: 
s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX942-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6336,14 +6199,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6362,13 +6222,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6387,12 +6245,10 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6409,12 +6265,10 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_min_f16 v3, v2, v1 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6430,23 +6284,19 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_min_f16_sdwa v3, v2, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v4, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 689263d21f2cc..d6d4dca29e8e6 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -1075,7 +1075,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1086,63 +1085,53 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX1100-FAKE16: ; %bb.0: ; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v6 ; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; SDAG-GFX900: ; %bb.0: -; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3 -; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; SDAG-GFX906: ; %bb.0: -; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0 -; 
SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3 -; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7 ; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp ; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0 -; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v1 -; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v2 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0x3c00 -; 
SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v3 -; SDAG-VI-NEXT: v_min_f16_e32 v1, 1.0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp ; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1201,26 +1190,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6 ; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; GISEL-GFX900: ; %bb.0: -; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; GISEL-GFX906: ; %bb.0: -; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1344,35 +1313,26 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-VI-NEXT: v_mac_f32_e32 v10, v7, v9 ; SDAG-VI-NEXT: v_mac_f32_e32 v11, v6, v8 -; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 ; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0 -; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v1 -; SDAG-VI-NEXT: v_max_f16_e32 v2, 0, v2 -; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v3 -; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0x3c00 -; SDAG-VI-NEXT: 
v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_min_f16_e32 v3, 1.0, v3 -; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp ; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-VI-NEXT: v_or_b32_e32 v1, v3, v1 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index dacee9a0173c5..dd9711157efa9 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -444,8 +444,6 @@ define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) { ; SDAG-GFX11-LABEL: test_minmax_f32_ieee_true: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; SDAG-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; SDAG-GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -464,8 +462,6 @@ define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) { ; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 ; SDAG-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 ; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -485,8 +481,6 @@ define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) { ; SDAG-GFX1250: ; %bb.0: ; SDAG-GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
SDAG-GFX1250-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; SDAG-GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v2 ; SDAG-GFX1250-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 ; SDAG-GFX1250-NEXT: s_set_pc_i64 s[30:31] ; @@ -594,8 +588,6 @@ define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) { ; SDAG-GFX11-LABEL: test_maxmin_f32_ieee_true: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; SDAG-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; SDAG-GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -614,8 +606,6 @@ define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) { ; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 ; SDAG-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 ; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -635,8 +625,6 @@ define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) { ; SDAG-GFX1250: ; %bb.0: ; SDAG-GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; SDAG-GFX1250-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; SDAG-GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v2 ; SDAG-GFX1250-NEXT: v_minmax_num_f32 v0, v0, v1, v2 ; SDAG-GFX1250-NEXT: s_set_pc_i64 s[30:31] ; @@ -945,18 +933,12 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX11-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true: ; SDAG-GFX11-TRUE16: ; %bb.0: ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l -; 
SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v1.l, v2.l ; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: ; SDAG-GFX11-FAKE16: ; %bb.0: ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 -; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2 ; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -985,10 +967,7 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l ; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: @@ -998,9 +977,6 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 ; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1034,19 +1010,13 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX1250-TRUE16: ; %bb.0: ; SDAG-GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; SDAG-GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; 
SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; SDAG-GFX1250-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX1250-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l ; SDAG-GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; SDAG-GFX1250-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: ; SDAG-GFX1250-FAKE16: ; %bb.0: ; SDAG-GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; SDAG-GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; SDAG-GFX1250-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 ; SDAG-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; @@ -1147,18 +1117,12 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX11-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true: ; SDAG-GFX11-TRUE16: ; %bb.0: ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l -; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v1.l, v2.l ; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: ; SDAG-GFX11-FAKE16: ; %bb.0: ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 -; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; SDAG-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2 ; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1187,10 +1151,7 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; 
SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l ; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: @@ -1200,9 +1161,6 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; SDAG-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 ; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1236,19 +1194,13 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX1250-TRUE16: ; %bb.0: ; SDAG-GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; SDAG-GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; SDAG-GFX1250-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX1250-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l ; SDAG-GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; SDAG-GFX1250-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: ; SDAG-GFX1250-FAKE16: ; %bb.0: ; SDAG-GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; SDAG-GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; 
SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; SDAG-GFX1250-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 ; SDAG-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll index 74ee867959429..d3e609c9e8645 100644 --- a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll +++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll @@ -245,10 +245,9 @@ define { float, float } @aggregate_use(float %z) { ; CHECK-NEXT: v_writelane_b32 v41, s31, 1 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_max_f32_e32 v2, v40, v40 +; CHECK-NEXT: v_min_f32_e32 v0, v0, v40 +; CHECK-NEXT: v_min_f32_e32 v1, v1, v40 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: v_min_f32_e32 v0, v0, v2 -; CHECK-NEXT: v_min_f32_e32 v1, v1, v2 ; CHECK-NEXT: v_readlane_b32 s31, v41, 1 ; CHECK-NEXT: v_readlane_b32 s30, v41, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 9afaab5ebcfb6..f42584ffd823a 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -875,10 +875,7 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_waitcnt vmcnt(1) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -902,10 +899,7 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; 
GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/reduction.ll b/llvm/test/CodeGen/AMDGPU/reduction.ll index 291eccd405b8a..6c248e08b1ac1 100644 --- a/llvm/test/CodeGen/AMDGPU/reduction.ll +++ b/llvm/test/CodeGen/AMDGPU/reduction.ll @@ -557,8 +557,6 @@ define half @reduction_maxnum_v4f16(<4 x half> %vec4) { ; GFX9-LABEL: reduction_maxnum_v4f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -566,11 +564,7 @@ define half @reduction_maxnum_v4f16(<4 x half> %vec4) { ; VI-LABEL: reduction_maxnum_v4f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_e32 v2, v3, v2 +; VI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -587,8 +581,6 @@ define half @reduction_minnum_v4f16(<4 x half> %vec4) { ; GFX9-LABEL: reduction_minnum_v4f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ 
-596,11 +588,7 @@ define half @reduction_minnum_v4f16(<4 x half> %vec4) { ; VI-LABEL: reduction_minnum_v4f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_min_f16_e32 v2, v3, v2 +; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -619,8 +607,6 @@ define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) { ; GFX9-LABEL: reduction_fast_max_pattern_v4f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -628,11 +614,7 @@ define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) { ; VI-LABEL: reduction_fast_max_pattern_v4f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_e32 v2, v3, v2 +; VI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -653,8 +635,6 @@ define half @reduction_fast_min_pattern_v4f16(<4 x 
half> %vec4) { ; GFX9-LABEL: reduction_fast_min_pattern_v4f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -662,11 +642,7 @@ define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) { ; VI-LABEL: reduction_fast_min_pattern_v4f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_min_f16_e32 v2, v3, v2 +; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll index 44b8b8bcb9ae8..f27916b13536d 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll @@ -40,9 +40,7 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v2half: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: 
test_vector_reduce_fmax_v2half: @@ -56,9 +54,7 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v2half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_fmax_v2half: @@ -72,9 +68,7 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v2half: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_fmax_v2half: @@ -88,9 +82,6 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v2half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -98,9 +89,7 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) { ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -130,9 +119,6 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -144,9 +130,7 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -199,10 +183,7 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v3half: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 +; 
GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -219,11 +200,9 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v3half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: s_movk_i32 s0, 0x7e00 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, s0 -; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-SDAG-NEXT: s_nop 0 +; GFX9-SDAG-NEXT: s_mov_b32 s0, 0xfe00 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x5040100 +; GFX9-SDAG-NEXT: v_perm_b32 v1, s0, v1, v2 ; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-SDAG-NEXT: s_nop 0 ; GFX9-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -242,9 +221,8 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v3half: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-SDAG-NEXT: v_pack_b32_f16 v1, v1, 0x7e00 +; GFX10-SDAG-NEXT: s_mov_b32 s4, 0xfe00 +; GFX10-SDAG-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 ; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX10-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -262,9 +240,7 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v3half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7e00 -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 +; 
GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0xfe00 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h @@ -273,10 +249,9 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v3half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0x7e00 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s0, 0xfe00 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -314,9 +289,7 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7e00 -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0xfe00 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v0, v0, v1 ; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h @@ -329,13 +302,13 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0x7e00 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_mov_b32 s0, 0xfe00 +; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-SDAG-FAKE16-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -417,13 +390,9 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v4half: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmax_v4half: @@ -441,9 +410,6 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX9-SDAG-LABEL: 
test_vector_reduce_fmax_v4half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-SDAG-NEXT: s_nop 0 ; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-SDAG-NEXT: s_nop 0 ; GFX9-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -464,8 +430,6 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v4half: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX10-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -485,22 +449,17 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v4half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v4half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v0, v0, v1 +; 
GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -542,10 +501,8 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -556,12 +513,9 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -686,21 +640,13 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v8half: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: 
v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmax_v8half: @@ -726,10 +672,6 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v8half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-SDAG-NEXT: 
v_pk_max_f16 v0, v0, v2 ; GFX9-SDAG-NEXT: s_nop 0 @@ -761,10 +703,6 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v8half: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v1 @@ -794,11 +732,6 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v8half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -809,11 +742,6 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v8half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -878,11 +806,6 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v2, v2, v2 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v1, v1, v3 ; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v0, v0, v2 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -897,11 +820,6 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v2, v2, v2 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v1, v1, v3 ; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1121,37 +1039,21 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v16half: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v9 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v8 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v4, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v5, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v6, v6 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v7, v7 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: 
v_max_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v4 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v6 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v7 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmax_v16half: @@ -1193,10 +1095,8 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v16half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f16_sdwa v15, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v15 +; GFX9-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; GFX9-SDAG-NEXT: v_max3_f16 v0, v0, v1, v14 ; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v3 @@ -1251,14 +1151,12 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v16half: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f16_sdwa v8, v0, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v8 -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX10-SDAG-NEXT: v_max3_f16 v0, v0, v1, v9 +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-SDAG-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX10-SDAG-NEXT: v_max3_f16 v0, v0, v1, v8 ; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX10-SDAG-NEXT: v_max3_f16 v0, v0, v2, v8 +; GFX10-SDAG-NEXT: v_max3_f16 v0, v0, v2, v9 ; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX10-SDAG-NEXT: v_max3_f16 v0, v0, v3, v1 ; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 @@ -1309,19 +1207,17 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v16half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v2.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v3.l, v3.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v3.l, v3.h ; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v4.l, v4.h -; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v5.l, v5.h ; GFX11-SDAG-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v5.l, v5.h ; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v6.l, v6.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_max3_f16 v0.l, v0.l, v7.l, v7.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1329,27 +1225,24 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v8, v8, v8 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v0, v0, v8 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_max3_f16 v0, v0, v1, v9 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_max3_f16 v0, v0, v2, v8 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_max3_f16 v0, v0, v3, v1 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_max3_f16 v0, v0, v4, v2 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; 
GFX11-SDAG-FAKE16-NEXT: v_max3_f16 v0, v0, v5, v1 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_max3_f16 v0, v0, v6, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_max3_f16 v0, v0, v7, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1445,19 +1338,17 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v1.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v1.h ; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v2.l, v2.h -; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v3.l, v3.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v3.l, v3.h ; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v4.l, v4.h -; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v5.l, v5.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v5.l, v5.h ; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v6.l, v6.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v7.l, v7.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1469,27 +1360,24 @@ define half 
@test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v8, v8, v8 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v8 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v9 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_max3_num_f16 v0, v0, v2, v8 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_max3_num_f16 v0, v0, v3, v1 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_max3_num_f16 v0, v0, v4, v2 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_max3_num_f16 v0, v0, v5, v1 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_max3_num_f16 v0, v0, v6, v2 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_max3_num_f16 v0, v0, v7, v1 ; 
GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1594,8 +1482,6 @@ define float @test_vector_reduce_fmax_v2float(<2 x float> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v2float: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1610,8 +1496,6 @@ define float @test_vector_reduce_fmax_v2float(<2 x float> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v2float: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1626,8 +1510,6 @@ define float @test_vector_reduce_fmax_v2float(<2 x float> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v2float: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1642,8 +1524,6 @@ define float @test_vector_reduce_fmax_v2float(<2 x float> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v2float: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1658,8 +1538,6 @@ define float @test_vector_reduce_fmax_v2float(<2 x float> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v2float: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1678,8 +1556,6 @@ define float @test_vector_reduce_fmax_v2float(<2 x float> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max_num_f32_e32 v0, v0, v1 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1810,8 +1686,6 @@ define float @test_vector_reduce_fmax_v4float(<4 x float> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v4float: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1831,8 +1705,6 @@ define float @test_vector_reduce_fmax_v4float(<4 x float> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v4float: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1852,8 +1724,6 @@ define float @test_vector_reduce_fmax_v4float(<4 x float> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v4float: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1873,8 +1743,6 @@ define float @test_vector_reduce_fmax_v4float(<4 x float> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v4float: ; 
GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX10-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1894,9 +1762,8 @@ define float @test_vector_reduce_fmax_v4float(<4 x float> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v4float: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1917,9 +1784,8 @@ define float @test_vector_reduce_fmax_v4float(<4 x float> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1945,8 +1811,6 @@ define float @test_vector_reduce_fmax_v8float(<8 x float> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v8float: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 @@ -1976,8 +1840,6 @@ define float @test_vector_reduce_fmax_v8float(<8 x float> %v) { ; 
GFX8-SDAG-LABEL: test_vector_reduce_fmax_v8float: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX8-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 @@ -2007,8 +1869,6 @@ define float @test_vector_reduce_fmax_v8float(<8 x float> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v8float: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX9-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 @@ -2038,8 +1898,6 @@ define float @test_vector_reduce_fmax_v8float(<8 x float> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v8float: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX10-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX10-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 @@ -2069,12 +1927,11 @@ define float @test_vector_reduce_fmax_v8float(<8 x float> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v8float: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: 
v_max3_f32 v0, v0, v6, v7 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +1957,11 @@ define float @test_vector_reduce_fmax_v8float(<8 x float> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max_num_f32_e32 v0, v0, v1 -; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v4, v5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v6, v7 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2136,8 +1992,6 @@ define float @test_vector_reduce_fmax_v16float(<16 x float> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v16float: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 @@ -2187,8 +2041,6 @@ define float @test_vector_reduce_fmax_v16float(<16 x float> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v16float: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX8-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 @@ -2238,8 +2090,6 @@ define float @test_vector_reduce_fmax_v16float(<16 x float> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v16float: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX9-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 @@ -2289,8 +2139,6 @@ define float @test_vector_reduce_fmax_v16float(<16 x float> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v16float: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX10-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX10-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 @@ -2340,18 +2188,17 @@ define float @test_vector_reduce_fmax_v16float(<16 x float> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v16float: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 -; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v6, v7 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v6, v7 ; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v8, v9 -; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v10, v11 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v10, v11 ; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v12, v13 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max3_f32 v0, v0, v14, v15 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2389,18 +2236,17 @@ define float 
@test_vector_reduce_fmax_v16float(<16 x float> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max_num_f32_e32 v0, v0, v1 -; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v4, v5 -; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v6, v7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v6, v7 ; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v8, v9 -; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v10, v11 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v10, v11 ; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v12, v13 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max3_num_f32 v0, v0, v14, v15 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2444,8 +2290,6 @@ define double @test_vector_reduce_fmax_v2double(<2 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v2double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2460,8 +2304,6 @@ define double @test_vector_reduce_fmax_v2double(<2 x double> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v2double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 
v[0:1], v[0:1], v[0:1] ; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2476,8 +2318,6 @@ define double @test_vector_reduce_fmax_v2double(<2 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v2double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2492,8 +2332,6 @@ define double @test_vector_reduce_fmax_v2double(<2 x double> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v2double: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2508,9 +2346,6 @@ define double @test_vector_reduce_fmax_v2double(<2 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v2double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2530,9 +2365,6 @@ define double @test_vector_reduce_fmax_v2double(<2 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2557,9 +2389,6 @@ define double 
@test_vector_reduce_fmax_v3double(<3 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v3double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2577,9 +2406,6 @@ define double @test_vector_reduce_fmax_v3double(<3 x double> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v3double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2597,11 +2423,8 @@ define double @test_vector_reduce_fmax_v3double(<3 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v3double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_fmax_v3double: @@ -2617,9 +2440,6 @@ define double @test_vector_reduce_fmax_v3double(<3 x double> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v3double: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 
-; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2637,11 +2457,8 @@ define double @test_vector_reduce_fmax_v3double(<3 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v3double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2663,11 +2480,8 @@ define double @test_vector_reduce_fmax_v3double(<3 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2694,13 +2508,9 @@ define double @test_vector_reduce_fmax_v4double(<4 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v4double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7-SDAG-NEXT: v_max_f64 v[0:1], 
v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] ; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v4double: @@ -2718,13 +2528,9 @@ define double @test_vector_reduce_fmax_v4double(<4 x double> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v4double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] ; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmax_v4double: @@ -2742,13 +2548,9 @@ define double @test_vector_reduce_fmax_v4double(<4 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v4double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_fmax_v4double: @@ -2766,13 +2568,9 @@ define double @test_vector_reduce_fmax_v4double(<4 x double> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v4double: ; 
GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] ; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_fmax_v4double: @@ -2790,15 +2588,10 @@ define double @test_vector_reduce_fmax_v4double(<4 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v4double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_fmax_v4double: @@ -2822,15 +2615,10 @@ define double @test_vector_reduce_fmax_v4double(<4 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] -; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[6:7] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_fmax_v4double: @@ -2859,21 +2647,13 @@ define double @test_vector_reduce_fmax_v8double(<8 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v8double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[10:11], v[10:11] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[12:13], v[12:13] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[14:15], v[14:15] ; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: 
test_vector_reduce_fmax_v8double: @@ -2899,21 +2679,13 @@ define double @test_vector_reduce_fmax_v8double(<8 x double> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v8double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[10:11], v[10:11] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[12:13], v[12:13] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[14:15], v[14:15] ; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmax_v8double: @@ -2939,21 +2711,13 @@ define double @test_vector_reduce_fmax_v8double(<8 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v8double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], 
v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[12:13], v[12:13] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_fmax_v8double: @@ -2979,21 +2743,13 @@ define double @test_vector_reduce_fmax_v8double(<8 x double> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmax_v8double: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[8:9], v[8:9] ; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] ; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 
v[8:9] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_fmax_v8double: @@ -3019,25 +2775,16 @@ define double @test_vector_reduce_fmax_v8double(<8 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v8double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[8:9], v[8:9] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] +; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_fmax_v8double: @@ -3069,25 +2816,16 @@ define double @test_vector_reduce_fmax_v8double(<8 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[6:7] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[8:9] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[10:11], v[10:11] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[12:13], v[12:13] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[14:15], v[14:15] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[8:9] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[10:11] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[12:13] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[14:15] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_fmax_v8double: @@ -3124,39 +2862,23 @@ define double @test_vector_reduce_fmax_v16double(<16 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v16double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] ; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[16:17], v[16:17] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[18:19], v[18:19] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[20:21], v[20:21] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[22:23], v[22:23] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[24:25], v[24:25] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], 
v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[26:27], v[26:27] +; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[28:29], v[28:29] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[16:17] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[18:19] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[20:21] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[22:23] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[24:25] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[26:27] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[28:29] ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[30:31], v[30:31] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[30:31] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v16double: @@ -3200,39 +2922,23 @@ define double @test_vector_reduce_fmax_v16double(<16 x double> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v16double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; 
GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[16:17], v[16:17] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[18:19], v[18:19] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[20:21], v[20:21] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[22:23], v[22:23] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[24:25], v[24:25] ; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[26:27], v[26:27] +; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[28:29], v[28:29] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[16:17] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[18:19] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[20:21] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[22:23] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[24:25] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[26:27] +; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[28:29] ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[30:31], v[30:31] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: v_max_f64 
v[0:1], v[0:1], v[30:31] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmax_v16double: @@ -3277,38 +2983,22 @@ define double @test_vector_reduce_fmax_v16double(<16 x double> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX9-SDAG-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] -; GFX9-SDAG-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] -; GFX9-SDAG-NEXT: v_max_f64 v[16:17], v[16:17], v[16:17] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] -; GFX9-SDAG-NEXT: v_max_f64 v[18:19], v[18:19], v[18:19] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_max_f64 v[20:21], v[20:21], v[20:21] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[18:19] -; GFX9-SDAG-NEXT: v_max_f64 v[22:23], v[22:23], v[22:23] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[20:21] -; GFX9-SDAG-NEXT: v_max_f64 v[24:25], v[24:25], v[24:25] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[22:23] -; GFX9-SDAG-NEXT: v_max_f64 v[26:27], v[26:27], v[26:27] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[24:25] -; GFX9-SDAG-NEXT: v_max_f64 v[28:29], v[28:29], v[28:29] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[26:27] ; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[28:29] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 
v[2:3], v[30:31], v[30:31] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[30:31] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_fmax_v16double: @@ -3353,38 +3043,22 @@ define double @test_vector_reduce_fmax_v16double(<16 x double> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[8:9], v[8:9] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[16:17], v[16:17] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[18:19], v[18:19] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[20:21], v[20:21] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[22:23], v[22:23] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[24:25], v[24:25] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[26:27], v[26:27] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[28:29], v[28:29] ; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; 
GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[18:19] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[20:21] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[22:23] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[24:25] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[26:27] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[28:29] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[30:31], v[30:31] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[30:31] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_fmax_v16double: @@ -3428,46 +3102,30 @@ define double @test_vector_reduce_fmax_v16double(<16 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v16double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[8:9], v[8:9] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_max_f64 
v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[16:17], v[16:17] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[18:19], v[18:19] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[20:21], v[20:21] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[22:23], v[22:23] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[24:25], v[24:25] ; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[26:27], v[26:27] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[28:29], v[28:29] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[10:11] +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[12:13] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[14:15] +; GFX11-SDAG-NEXT: 
v_max_f64 v[0:1], v[0:1], v[16:17] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[18:19] +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[20:21] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[22:23] +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[24:25] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[26:27] +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[28:29] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[30:31], v[30:31] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[30:31] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_fmax_v16double: @@ -3518,46 +3176,30 @@ define double @test_vector_reduce_fmax_v16double(<16 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] -; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[6:7] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[8:9] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: 
v_max_num_f64_e32 v[2:3], v[10:11], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[12:13], v[12:13] ; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[14:15], v[14:15] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[16:17], v[16:17] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[18:19], v[18:19] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[20:21], v[20:21] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[22:23], v[22:23] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[24:25], v[24:25] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[26:27], v[26:27] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 ; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[28:29], v[28:29] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[8:9] +; GFX12-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[10:11] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[12:13] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[14:15] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[16:17] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[18:19] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[20:21] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[22:23] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[24:25] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[26:27] +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[28:29] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[30:31], v[30:31] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[30:31] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_fmax_v16double: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll index ed5c910def3d6..6ad1101341afe 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll @@ -40,9 +40,7 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v2half: ; GFX8-SDAG: ; %bb.0: ; %entry 
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmin_v2half: @@ -56,9 +54,7 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v2half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_fmin_v2half: @@ -72,9 +68,7 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v2half: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_fmin_v2half: @@ -88,9 +82,6 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v2half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: 
v_max_f16_e32 v0.h, v0.h, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -98,9 +89,7 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) { ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -130,9 +119,6 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -144,9 +130,7 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -199,10 +183,7 @@ define half 
@test_vector_reduce_fmin_v3half(<3 x half> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v3half: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -219,11 +200,9 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v3half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-SDAG-NEXT: s_movk_i32 s0, 0x7e00 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, s0 -; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-SDAG-NEXT: s_nop 0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x5040100 +; GFX9-SDAG-NEXT: v_perm_b32 v1, s0, v1, v2 ; GFX9-SDAG-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-SDAG-NEXT: s_nop 0 ; GFX9-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -242,9 +221,8 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v3half: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-SDAG-NEXT: v_pack_b32_f16 v1, v1, 0x7e00 +; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX10-SDAG-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 ; GFX10-SDAG-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX10-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -263,8 +241,6 @@ 
define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7e00 -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h @@ -273,10 +249,9 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v3half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0x7e00 +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x7e00 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -315,8 +290,6 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7e00 -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v0, v0, v0 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v0, v0, v1 ; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v0.l, v0.h @@ 
-329,13 +302,13 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0x7e00 -; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x7e00 +; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-SDAG-FAKE16-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -417,13 +390,9 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v4half: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] 
; ; GFX8-GISEL-LABEL: test_vector_reduce_fmin_v4half: @@ -441,9 +410,6 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v4half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-SDAG-NEXT: s_nop 0 ; GFX9-SDAG-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-SDAG-NEXT: s_nop 0 ; GFX9-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -464,8 +430,6 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v4half: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX10-SDAG-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX10-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -485,22 +449,17 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v4half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v4half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v0, v0, 
v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -542,10 +501,8 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -556,12 +513,9 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -686,21 +640,13 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; 
GFX8-SDAG-LABEL: test_vector_reduce_fmin_v8half: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v3, v3 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmin_v8half: @@ -726,10 +672,6 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v8half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-SDAG-NEXT: v_pk_max_f16 
v1, v1, v1 -; GFX9-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX9-SDAG-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-SDAG-NEXT: s_nop 0 @@ -761,10 +703,6 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v8half: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX10-SDAG-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_min_f16 v0, v0, v1 @@ -794,11 +732,6 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v8half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -809,11 +742,6 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v8half: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -878,11 +806,6 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v2, v2, v2 -; GFX12-SDAG-TRUE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v1, v1, v3 ; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v0, v0, v2 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -897,11 +820,6 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v2, v2, v2 -; GFX12-SDAG-FAKE16-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v1, v1, v3 ; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1121,37 +1039,21 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v16half: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v9 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v8 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v3, v3 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v4, v4 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v5, v5 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v6, v6 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v7, v7 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v1, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: 
v_min_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v4 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v6 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v7 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmin_v16half: @@ -1193,10 +1095,8 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v16half: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f16_sdwa v15, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX9-SDAG-NEXT: v_min_f16_e32 v0, v0, v15 +; GFX9-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; GFX9-SDAG-NEXT: v_min3_f16 v0, v0, v1, v14 ; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v3 @@ -1251,14 +1151,12 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v16half: ; 
GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f16_sdwa v8, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v0, v8 -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX10-SDAG-NEXT: v_min3_f16 v0, v0, v1, v9 +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-SDAG-NEXT: v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX10-SDAG-NEXT: v_min3_f16 v0, v0, v1, v8 ; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX10-SDAG-NEXT: v_min3_f16 v0, v0, v2, v8 +; GFX10-SDAG-NEXT: v_min3_f16 v0, v0, v2, v9 ; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX10-SDAG-NEXT: v_min3_f16 v0, v0, v3, v1 ; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 @@ -1309,19 +1207,17 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v16half: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v2.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v3.l, v3.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v3.l, v3.h ; 
GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v4.l, v4.h -; GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v5.l, v5.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v5.l, v5.h ; GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v6.l, v6.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min3_f16 v0.l, v0.l, v7.l, v7.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1329,27 +1225,24 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v8, v8, v8 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v0, v8 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_min3_f16 v0, v0, v1, v9 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_min3_f16 v0, v0, v2, v8 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_min3_f16 v0, v0, v3, v1 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_min3_f16 v0, v0, v4, v2 ; 
GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-FAKE16-NEXT: v_min3_f16 v0, v0, v5, v1 ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_min3_f16 v0, v0, v6, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_min3_f16 v0, v0, v7, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1445,19 +1338,17 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v0.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v1.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v1.h ; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v2.l, v2.h -; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v3.l, v3.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v3.l, v3.h ; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v4.l, v4.h -; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v5.l, v5.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v5.l, v5.h ; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v6.l, v6.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v7.l, v7.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1469,27 +1360,24 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_max_num_f16_e32 v8, v8, v8 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v8 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_min3_num_f16 v0, v0, v1, v9 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_min3_num_f16 v0, v0, v2, v8 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_min3_num_f16 v0, v0, v3, v1 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_min3_num_f16 v0, v0, v4, v2 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-FAKE16-NEXT: v_min3_num_f16 v0, v0, v5, v1 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-SDAG-FAKE16-NEXT: v_min3_num_f16 v0, v0, v6, v2 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_min3_num_f16 v0, v0, v7, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1594,8 +1482,6 @@ define float @test_vector_reduce_fmin_v2float(<2 x float> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v2float: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1610,8 +1496,6 @@ define float @test_vector_reduce_fmin_v2float(<2 x float> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v2float: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1626,8 +1510,6 @@ define float @test_vector_reduce_fmin_v2float(<2 x float> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v2float: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1642,8 +1524,6 @@ define float @test_vector_reduce_fmin_v2float(<2 x float> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v2float: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1658,8 +1538,6 @@ define float @test_vector_reduce_fmin_v2float(<2 x float> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v2float: ; GFX11-SDAG: ; %bb.0: ; %entry ; 
GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1678,8 +1556,6 @@ define float @test_vector_reduce_fmin_v2float(<2 x float> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min_num_f32_e32 v0, v0, v1 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1810,8 +1686,6 @@ define float @test_vector_reduce_fmin_v4float(<4 x float> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v4float: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1831,8 +1705,6 @@ define float @test_vector_reduce_fmin_v4float(<4 x float> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v4float: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1852,8 +1724,6 @@ define float @test_vector_reduce_fmin_v4float(<4 x float> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v4float: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX9-SDAG-NEXT: 
s_setpc_b64 s[30:31] @@ -1873,8 +1743,6 @@ define float @test_vector_reduce_fmin_v4float(<4 x float> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v4float: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1894,9 +1762,8 @@ define float @test_vector_reduce_fmin_v4float(<4 x float> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v4float: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1917,9 +1784,8 @@ define float @test_vector_reduce_fmin_v4float(<4 x float> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1945,8 +1811,6 @@ define float @test_vector_reduce_fmin_v8float(<8 x float> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v8float: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: 
v_min3_f32 v0, v0, v2, v3 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 @@ -1976,8 +1840,6 @@ define float @test_vector_reduce_fmin_v8float(<8 x float> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v8float: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX8-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 @@ -2007,8 +1869,6 @@ define float @test_vector_reduce_fmin_v8float(<8 x float> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v8float: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX9-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 @@ -2038,8 +1898,6 @@ define float @test_vector_reduce_fmin_v8float(<8 x float> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v8float: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX10-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 @@ -2069,12 +1927,11 @@ define float @test_vector_reduce_fmin_v8float(<8 x float> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v8float: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v6, v7 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +1957,11 @@ define float @test_vector_reduce_fmin_v8float(<8 x float> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min_num_f32_e32 v0, v0, v1 -; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v4, v5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v6, v7 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2136,8 +1992,6 @@ define float @test_vector_reduce_fmin_v16float(<16 x float> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v16float: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 @@ -2187,8 +2041,6 @@ define float @test_vector_reduce_fmin_v16float(<16 x float> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v16float: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX8-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 @@ -2238,8 +2090,6 @@ define float 
@test_vector_reduce_fmin_v16float(<16 x float> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v16float: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX9-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 @@ -2289,8 +2139,6 @@ define float @test_vector_reduce_fmin_v16float(<16 x float> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v16float: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX10-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 @@ -2340,18 +2188,17 @@ define float @test_vector_reduce_fmin_v16float(<16 x float> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v16float: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 ; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 -; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v6, v7 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v6, v7 ; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v8, v9 -; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v10, v11 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v10, v11 ; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v12, 
v13 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min3_f32 v0, v0, v14, v15 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2389,18 +2236,17 @@ define float @test_vector_reduce_fmin_v16float(<16 x float> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min_num_f32_e32 v0, v0, v1 -; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v2, v3 ; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v4, v5 -; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v6, v7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v6, v7 ; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v8, v9 -; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v10, v11 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v10, v11 ; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v12, v13 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min3_num_f32 v0, v0, v14, v15 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2443,8 +2289,6 @@ define double @test_vector_reduce_fmin_v2double(<2 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v2double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2459,8 +2303,6 @@ define double @test_vector_reduce_fmin_v2double(<2 x double> %v) { ; GFX8-SDAG-LABEL: 
test_vector_reduce_fmin_v2double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2475,8 +2317,6 @@ define double @test_vector_reduce_fmin_v2double(<2 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v2double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2491,8 +2331,6 @@ define double @test_vector_reduce_fmin_v2double(<2 x double> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v2double: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2507,9 +2345,6 @@ define double @test_vector_reduce_fmin_v2double(<2 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v2double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2529,9 +2364,6 @@ define double @test_vector_reduce_fmin_v2double(<2 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2556,9 +2388,6 @@ define double @test_vector_reduce_fmin_v3double(<3 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v3double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2576,9 +2405,6 @@ define double @test_vector_reduce_fmin_v3double(<3 x double> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v3double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2596,11 +2422,8 @@ define double @test_vector_reduce_fmin_v3double(<3 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v3double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_fmin_v3double: @@ -2616,9 +2439,6 @@ define double @test_vector_reduce_fmin_v3double(<3 x double> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v3double: ; 
GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2636,11 +2456,8 @@ define double @test_vector_reduce_fmin_v3double(<3 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v3double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2662,11 +2479,8 @@ define double @test_vector_reduce_fmin_v3double(<3 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2693,13 +2507,9 @@ define double @test_vector_reduce_fmin_v4double(<4 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v4double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] ; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v4double: @@ -2717,13 +2527,9 @@ define double @test_vector_reduce_fmin_v4double(<4 x double> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v4double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] ; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmin_v4double: @@ -2741,13 +2547,9 @@ define double @test_vector_reduce_fmin_v4double(<4 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v4double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX9-GISEL-LABEL: test_vector_reduce_fmin_v4double: @@ -2765,13 +2567,9 @@ define double @test_vector_reduce_fmin_v4double(<4 x double> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v4double: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] ; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_fmin_v4double: @@ -2789,15 +2587,10 @@ define double @test_vector_reduce_fmin_v4double(<4 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v4double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_fmin_v4double: @@ -2821,15 +2614,10 @@ define double @test_vector_reduce_fmin_v4double(<4 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 
0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[6:7] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_fmin_v4double: @@ -2858,21 +2646,13 @@ define double @test_vector_reduce_fmin_v8double(<8 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v8double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[10:11], v[10:11] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[12:13], v[12:13] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[14:15], v[14:15] ; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 
v[10:11] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[12:13] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v8double: @@ -2898,21 +2678,13 @@ define double @test_vector_reduce_fmin_v8double(<8 x double> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v8double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[10:11], v[10:11] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[12:13], v[12:13] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[14:15], v[14:15] ; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[10:11] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[12:13] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmin_v8double: @@ -2938,21 +2710,13 @@ define double @test_vector_reduce_fmin_v8double(<8 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v8double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; 
GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[12:13], v[12:13] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[10:11] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[12:13] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_fmin_v8double: @@ -2978,21 +2742,13 @@ define double @test_vector_reduce_fmin_v8double(<8 x double> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_fmin_v8double: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[8:9], v[8:9] ; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] ; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX10-SDAG-NEXT: 
v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[10:11] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[12:13] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_fmin_v8double: @@ -3018,25 +2774,16 @@ define double @test_vector_reduce_fmin_v8double(<8 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v8double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[8:9], v[8:9] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[10:11] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[12:13] +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_fmin_v8double: @@ -3068,25 +2815,16 @@ define double @test_vector_reduce_fmin_v8double(<8 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[6:7] -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[8:9] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[10:11], v[10:11] -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[12:13], v[12:13] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[14:15], v[14:15] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: 
v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[8:9] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[10:11] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[12:13] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[14:15] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_fmin_v8double: @@ -3123,39 +2861,23 @@ define double @test_vector_reduce_fmin_v16double(<16 x double> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v16double: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] ; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[16:17], v[16:17] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[18:19], v[18:19] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[20:21], v[20:21] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; 
GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[22:23], v[22:23] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[24:25], v[24:25] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[26:27], v[26:27] +; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_max_f64 v[4:5], v[28:29], v[28:29] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[10:11] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[12:13] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[16:17] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[18:19] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[20:21] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[22:23] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[24:25] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[26:27] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[28:29] ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[30:31], v[30:31] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[30:31] ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v16double: @@ -3199,39 +2921,23 @@ define double @test_vector_reduce_fmin_v16double(<16 x double> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v16double: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-SDAG-NEXT: 
v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[16:17], v[16:17] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[18:19], v[18:19] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[20:21], v[20:21] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[22:23], v[22:23] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[24:25], v[24:25] ; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[26:27], v[26:27] +; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_max_f64 v[4:5], v[28:29], v[28:29] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[10:11] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[12:13] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[16:17] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[18:19] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[20:21] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[22:23] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[24:25] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[26:27] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[28:29] ; GFX8-SDAG-NEXT: s_waitcnt 
vmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[30:31], v[30:31] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[30:31] ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_fmin_v16double: @@ -3276,38 +2982,22 @@ define double @test_vector_reduce_fmin_v16double(<16 x double> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX9-SDAG-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[10:11] -; GFX9-SDAG-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[12:13] -; GFX9-SDAG-NEXT: v_max_f64 v[16:17], v[16:17], v[16:17] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] -; GFX9-SDAG-NEXT: v_max_f64 v[18:19], v[18:19], v[18:19] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_max_f64 v[20:21], v[20:21], v[20:21] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[18:19] -; GFX9-SDAG-NEXT: v_max_f64 v[22:23], v[22:23], v[22:23] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[20:21] -; GFX9-SDAG-NEXT: v_max_f64 v[24:25], v[24:25], v[24:25] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[22:23] -; GFX9-SDAG-NEXT: v_max_f64 v[26:27], v[26:27], v[26:27] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[24:25] -; GFX9-SDAG-NEXT: v_max_f64 
v[28:29], v[28:29], v[28:29] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[26:27] ; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[28:29] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[30:31], v[30:31] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[30:31] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_fmin_v16double: @@ -3352,38 +3042,22 @@ define double @test_vector_reduce_fmin_v16double(<16 x double> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[8:9], v[8:9] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[16:17], v[16:17] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[18:19], v[18:19] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[20:21], v[20:21] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[22:23], v[22:23] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[24:25], v[24:25] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: 
v_max_f64 v[2:3], v[26:27], v[26:27] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_max_f64 v[4:5], v[28:29], v[28:29] ; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[10:11] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[12:13] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[18:19] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[20:21] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[22:23] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[24:25] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[26:27] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[28:29] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[30:31], v[30:31] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[30:31] ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_fmin_v16double: @@ -3427,46 +3101,30 @@ define double @test_vector_reduce_fmin_v16double(<16 x double> %v) { ; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v16double: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[8:9], v[8:9] -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], 
v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[10:11], v[10:11] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[12:13], v[12:13] -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[14:15], v[14:15] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[16:17], v[16:17] -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[18:19], v[18:19] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[20:21], v[20:21] -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[22:23], v[22:23] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[24:25], v[24:25] ; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[26:27], v[26:27] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_max_f64 v[4:5], v[28:29], v[28:29] -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[10:11] +; GFX11-SDAG-NEXT: 
v_min_f64 v[0:1], v[0:1], v[12:13] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[14:15] +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[16:17] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[18:19] +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[20:21] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[22:23] +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[24:25] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[26:27] +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[28:29] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[30:31], v[30:31] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[30:31] ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_fmin_v16double: @@ -3517,46 +3175,30 @@ define double @test_vector_reduce_fmin_v16double(<16 x double> %v) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] -; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[6:7] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[8:9] -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[10:11], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[12:13], v[12:13] ; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[14:15], v[14:15] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[16:17], v[16:17] -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[18:19], v[18:19] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[20:21], v[20:21] -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[22:23], v[22:23] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[24:25], v[24:25] -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[26:27], v[26:27] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 ; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[28:29], v[28:29] -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[8:9] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[10:11] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[12:13] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[14:15] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[16:17] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[18:19] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[20:21] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[22:23] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[24:25] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[26:27] +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[28:29] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[30:31], v[30:31] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[30:31] ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_fmin_v16double: diff --git a/llvm/test/CodeGen/Hexagon/expandFMINNUM_FMAXNUM-v67.ll b/llvm/test/CodeGen/Hexagon/expandFMINNUM_FMAXNUM-v67.ll new file mode 100644 index 0000000000000..3317503f9166f --- /dev/null +++ 
b/llvm/test/CodeGen/Hexagon/expandFMINNUM_FMAXNUM-v67.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=hexagon -mcpu=hexagonv67 < %s | FileCheck %s + +; test expandFMINNUM_FMAXNUM on backend with FMINIMUMNUM/FMAXIMUMNUM. +; N -> not NaN +; S -> not SNaN +; Y -> may be NaN or SNaN + +define double @test_maxnumNN(double nofpclass(nan) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_maxnumNN: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = dfmax(r1:0,r3:2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumSS(double nofpclass(snan) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_maxnumSS: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = dfmax(r1:0,r3:2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumYY(double nofpclass(zero) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_maxnumYY: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: jump fmax +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumNS(double nofpclass(nan) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_maxnumNS: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = dfmax(r1:0,r3:2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumNY(double nofpclass(nan) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_maxnumNY: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; 
CHECK-NEXT: jump fmax +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumSN(double nofpclass(snan) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_maxnumSN: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = dfmax(r1:0,r3:2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumSY(double nofpclass(snan) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_maxnumSY: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: jump fmax +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumYN(double nofpclass(zero) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_maxnumYN: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: jump fmax +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumYS(double nofpclass(zero) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_maxnumYS: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: jump fmax +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + + +define double @test_minnumNN(double nofpclass(nan) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_minnumNN: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = dfmin(r1:0,r3:2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumSS(double nofpclass(snan) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_minnumSS: +; CHECK: 
.cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = dfmin(r1:0,r3:2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumYY(double nofpclass(zero) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_minnumYY: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: jump fmin +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumNS(double nofpclass(nan) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_minnumNS: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = dfmin(r1:0,r3:2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumNY(double nofpclass(nan) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_minnumNY: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: jump fmin +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumSN(double nofpclass(snan) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_minnumSN: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = dfmin(r1:0,r3:2) +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumSY(double nofpclass(snan) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_minnumSY: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: jump fmin +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double 
@test_minnumYN(double nofpclass(zero) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_minnumYN: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: jump fmin +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumYS(double nofpclass(zero) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_minnumYS: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: jump fmin +; CHECK-NEXT: } +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + diff --git a/llvm/test/CodeGen/WebAssembly/expandFMINNUM_FMAXNUM.ll b/llvm/test/CodeGen/WebAssembly/expandFMINNUM_FMAXNUM.ll new file mode 100644 index 0000000000000..eb2931361bcc9 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/expandFMINNUM_FMAXNUM.ll @@ -0,0 +1,243 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=wasm32 -disable-wasm-fallthrough-return-opt -wasm-keep-registers < %s | FileCheck %s + +; test expandFMINNUM_FMAXNUM on backend with FMINIMUM/FMAXIMUM. 
+; N -> not NaN +; S -> not SNaN +; Y -> may be NaN or SNaN + +define double @test_maxnumNN(double nofpclass(nan) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_maxnumNN: +; CHECK: .functype test_maxnumNN (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: f64.max $push0=, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumSS(double nofpclass(snan) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_maxnumSS: +; CHECK: .functype test_maxnumSS (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmax, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumYY(double nofpclass(zero) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_maxnumYY: +; CHECK: .functype test_maxnumYY (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmax, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumNS(double nofpclass(nan) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_maxnumNS: +; CHECK: .functype test_maxnumNS (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmax, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumNY(double nofpclass(nan) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_maxnumNY: +; CHECK: .functype test_maxnumNY 
(f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmax, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumSN(double nofpclass(snan) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_maxnumSN: +; CHECK: .functype test_maxnumSN (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmax, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumSY(double nofpclass(snan) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_maxnumSY: +; CHECK: .functype test_maxnumSY (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmax, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumYN(double nofpclass(zero) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_maxnumYN: +; CHECK: .functype test_maxnumYN (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmax, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_maxnumYS(double nofpclass(zero) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_maxnumYS: +; CHECK: .functype test_maxnumYS (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmax, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: 
+ %0 = tail call double @llvm.maxnum.f64(double %f1, double %f2) + ret double %0 +} + + +define double @test_minnumNN(double nofpclass(nan) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_minnumNN: +; CHECK: .functype test_minnumNN (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: f64.min $push0=, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumSS(double nofpclass(snan) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_minnumSS: +; CHECK: .functype test_minnumSS (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmin, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumYY(double nofpclass(zero) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_minnumYY: +; CHECK: .functype test_minnumYY (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmin, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumNS(double nofpclass(nan) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_minnumNS: +; CHECK: .functype test_minnumNS (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmin, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumNY(double nofpclass(nan) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_minnumNY: +; 
CHECK: .functype test_minnumNY (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmin, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumSN(double nofpclass(snan) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_minnumSN: +; CHECK: .functype test_minnumSN (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmin, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumSY(double nofpclass(snan) %f1, double nofpclass(zero) %f2) { +; CHECK-LABEL: test_minnumSY: +; CHECK: .functype test_minnumSY (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmin, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumYN(double nofpclass(zero) %f1, double nofpclass(nan) %f2) { +; CHECK-LABEL: test_minnumYN: +; CHECK: .functype test_minnumYN (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmin, $pop2, $pop1 +; CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + +define double @test_minnumYS(double nofpclass(zero) %f1, double nofpclass(snan) %f2) { +; CHECK-LABEL: test_minnumYS: +; CHECK: .functype test_minnumYS (f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get $push2=, 0 +; CHECK-NEXT: local.get $push1=, 1 +; CHECK-NEXT: call $push0=, fmin, $pop2, $pop1 +; 
CHECK-NEXT: return $pop0 +entry: + %0 = tail call double @llvm.minnum.f64(double %f1, double %f2) + ret double %0 +} + diff --git a/llvm/test/CodeGen/X86/pr59258.ll b/llvm/test/CodeGen/X86/pr59258.ll index e5f5ca71739df..16e97ad01d6f1 100644 --- a/llvm/test/CodeGen/X86/pr59258.ll +++ b/llvm/test/CodeGen/X86/pr59258.ll @@ -4,7 +4,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-LABEL: cvt_and_clamp2: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $104, %rsp +; CHECK-NEXT: subq $168, %rsp ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm1, %xmm0 @@ -12,18 +12,13 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -31,134 +26,238 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: callq fmaxf@PLT -; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: callq fmaxf@PLT -; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: callq fmaxf@PLT -; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # 
xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: callq fmaxf@PLT +; CHECK-NEXT: maxss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss (%rsp), %xmm0 # 4-byte Reload +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: maxss %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: maxss %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: callq fmaxf@PLT +; CHECK-NEXT: maxss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: callq fmaxf@PLT +; CHECK-NEXT: maxss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: callq fmaxf@PLT +; CHECK-NEXT: maxss %xmm1, %xmm0 ; CHECK-NEXT: callq 
__truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: callq fmaxf@PLT +; CHECK-NEXT: maxss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: callq fminf@PLT -; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: callq fminf@PLT -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: callq fminf@PLT -; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movd (%rsp), %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: callq fminf@PLT -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: callq fminf@PLT -; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: callq fminf@PLT +; CHECK-NEXT: ucomiss %xmm1, %xmm1 +; CHECK-NEXT: jp .LBB0_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: andps %xmm4, %xmm2 +; CHECK-NEXT: minss %xmm1, %xmm3 +; CHECK-NEXT: orps %xmm2, %xmm3 +; CHECK-NEXT: movaps %xmm4, %xmm2 +; CHECK-NEXT: jnp .LBB0_4 +; CHECK-NEXT: jmp .LBB0_5 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: jp .LBB0_5 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: movaps %xmm0, %xmm4 +; CHECK-NEXT: andps %xmm2, %xmm4 +; CHECK-NEXT: minss %xmm1, %xmm0 +; CHECK-NEXT: orps %xmm4, %xmm0 +; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: jnp .LBB0_6 +; CHECK-NEXT: # %bb.7: 
+; CHECK-NEXT: jnp .LBB0_8 +; CHECK-NEXT: .LBB0_9: +; CHECK-NEXT: jp .LBB0_11 +; CHECK-NEXT: .LBB0_10: +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; CHECK-NEXT: movaps %xmm5, %xmm4 +; CHECK-NEXT: andps %xmm2, %xmm4 +; CHECK-NEXT: minss %xmm1, %xmm5 +; CHECK-NEXT: orps %xmm4, %xmm5 +; CHECK-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: .LBB0_11: +; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: jp .LBB0_13 +; CHECK-NEXT: # %bb.12: +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: andps %xmm2, %xmm0 +; CHECK-NEXT: minss %xmm1, %xmm3 +; CHECK-NEXT: orps %xmm0, %xmm3 +; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: .LBB0_13: +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movaps %xmm0, %xmm4 +; CHECK-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: ucomiss %xmm2, %xmm2 +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: jp .LBB0_15 +; CHECK-NEXT: # %bb.14: +; CHECK-NEXT: movaps %xmm4, %xmm1 +; CHECK-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: andps %xmm3, %xmm1 +; CHECK-NEXT: minss %xmm2, %xmm4 +; CHECK-NEXT: orps %xmm1, %xmm4 +; CHECK-NEXT: .LBB0_15: +; CHECK-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: maxss %xmm1, %xmm0 +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: callq fminf@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: callq fminf@PLT +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: addq $104, %rsp +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: minss %xmm3, %xmm1 +; CHECK-NEXT: orps %xmm2, %xmm1 +; CHECK-NEXT: cmpunordss %xmm3, %xmm3 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: andnps %xmm1, %xmm2 +; CHECK-NEXT: andps %xmm3, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: addq $168, %rsp ; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; CHECK-NEXT: movaps %xmm5, %xmm4 +; CHECK-NEXT: andps %xmm2, %xmm4 +; CHECK-NEXT: minss %xmm1, %xmm5 +; CHECK-NEXT: orps %xmm4, %xmm5 +; CHECK-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: jp .LBB0_9 +; CHECK-NEXT: 
.LBB0_8: +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; CHECK-NEXT: movaps %xmm5, %xmm4 +; CHECK-NEXT: andps %xmm2, %xmm4 +; CHECK-NEXT: minss %xmm1, %xmm5 +; CHECK-NEXT: orps %xmm4, %xmm5 +; CHECK-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: jnp .LBB0_10 +; CHECK-NEXT: jmp .LBB0_11 %2 = fptrunc <8 x float> %0 to <8 x half> %3 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> zeroinitializer, <8 x half> %2) %4 = call <8 x half> @llvm.minnum.v8f16(<8 x half> %3, <8 x half> )